diff --git a/mlir/include/mlir/Dialect/StandardOps/Utils/Utils.h b/mlir/include/mlir/Dialect/StandardOps/Utils/Utils.h
new file mode 100644
index 000000000000..8d46d9bfd278
--- /dev/null
+++ b/mlir/include/mlir/Dialect/StandardOps/Utils/Utils.h
@@ -0,0 +1,32 @@
+//===- Utils.h - General transformation utilities ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines prototypes for various transformation utilities for
+// the StandardOps dialect. These are not passes by themselves but are used
+// either by passes, optimization sequences, or in turn by other transformation
+// utilities.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_STANDARDOPS_UTILS_UTILS_H
+#define MLIR_DIALECT_STANDARDOPS_UTILS_UTILS_H
+
+#include "mlir/IR/Value.h"
+
+namespace mlir {
+
+class Location;
+class OpBuilder;
+
+/// Given an operation, retrieves the value of each dynamic dimension through
+/// constructing the necessary DimOp operators.
+SmallVector<Value, 4> getDynOperands(Location loc, Value val, OpBuilder &b);
+
+} // end namespace mlir
+
+#endif // MLIR_DIALECT_STANDARDOPS_UTILS_UTILS_H
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp
index a3ab6f45b26e..5df7fe982281 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp
@@ -1,354 +1,343 @@
 //===- Bufferize.cpp - Bufferization of linalg ops ------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "mlir/Transforms/Bufferize.h"
 #include "PassDetail.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/StandardOps/Transforms/Passes.h"
+#include "mlir/Dialect/StandardOps/Utils/Utils.h"
 #include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/IR/BuiltinDialect.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/Pass/Pass.h"

 using namespace ::mlir;
 using namespace ::mlir::linalg;

-static SmallVector<Value, 4> getDynOperands(Location loc, Value val,
-                                            OpBuilder &b) {
-  SmallVector<Value, 4> dynOperands;
-  auto shapedType = val.getType().cast<ShapedType>();
-  for (auto dim : llvm::enumerate(shapedType.getShape())) {
-    if (dim.value() == TensorType::kDynamicSize) {
-      dynOperands.push_back(b.create<DimOp>(loc, val, dim.index()));
-    }
-  }
-  return dynOperands;
-}
-
 static Value cloneMemref(Location loc, Value memref, OpBuilder &b) {
   auto memrefType = memref.getType().cast<MemRefType>();
   auto alloc = b.create<AllocOp>(loc, memrefType,
                                  getDynOperands(loc, memref, b));
   b.create<linalg::CopyOp>(loc, memref, alloc);
   return alloc;
 }

 static LogicalResult
 allocateBuffersForResults(Location loc, LinalgOp linalgOp,
                           linalg::GenericOpAdaptor &adaptor,
                           SmallVectorImpl<Value> &resultBuffers,
                           OpBuilder &b) {
   // Lazily compute loopRanges.
   SmallVector<Range, 4> loopRanges;

   // Allocate a buffer for every tensor result.
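  // (Summary of the cases handled below, in order: clone the adapted output
  // buffer when the payload actually reads its value, reuse it when it is
  // already produced by an alloc, allocate statically-shaped results directly,
  // and otherwise create an alloc whose dynamic sizes come from
  // getDynOperands.)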
assert(linalgOp.getNumOutputs() == linalgOp->getNumResults()); for (auto en : llvm::enumerate(linalgOp->getResultTypes())) { size_t resultIndex = en.index(); Type resultType = en.value(); auto tensorType = resultType.dyn_cast(); if (tensorType == nullptr) { linalgOp.emitOpError() << "tensor to buffer conversion expects ranked tensor results"; return failure(); } auto tensorShape = tensorType.getShape(); auto memrefType = MemRefType::get(tensorShape, tensorType.getElementType()); Value resultTensor = adaptor.outputs()[resultIndex]; // Clone output buffers whose value is actually used. if (linalgOp.payloadUsesValueFromOutputOperandIndex(resultIndex)) { resultBuffers.push_back(cloneMemref(loc, resultTensor, b)); continue; } if (auto alloc = resultTensor.getDefiningOp()) { resultBuffers.push_back(resultTensor); continue; } // Allocate buffers for statically-shaped results. if (memrefType.hasStaticShape()) { resultBuffers.push_back(b.create(loc, memrefType)); continue; } resultBuffers.push_back(b.create( loc, memrefType, getDynOperands(loc, resultTensor, b))); } return success(); } /// Specialization for `linalg::GenericOp` and `linalg::IndexedGenericOp`. /// A pattern to convert Generic Linalg operations which work on tensors to /// use buffers. BufferPlacement pass should be later used to move /// Alloc operations to the correct positions and insert the missing Dealloc /// operations in the correct places. template static void finalizeBufferAllocationForGenericOp(ConversionPatternRewriter &rewriter, GenericOpTy genericOp, ValueRange inputs, ValueRange outputs) { // Generate a new linalg operation that works on buffers. auto newGenericOp = rewriter.create( genericOp.getLoc(), /*resultTensorTypes=*/llvm::None, /*inputs=*/inputs, /*outputs=*/outputs, genericOp.indexing_maps(), genericOp.iterator_types(), genericOp.docAttr(), genericOp.library_callAttr(), genericOp.sparseAttr()); // Create a new block in the region of the new Generic Op. Block *oldBlock = genericOp.getBody(); Region &newRegion = newGenericOp.region(); Block *newBlock = rewriter.createBlock(&newRegion, newRegion.begin(), oldBlock->getArgumentTypes()); // Clone the body of the old block to the new block. BlockAndValueMapping mapping; mapping.map(oldBlock->getArguments(), newBlock->getArguments()); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointToEnd(newBlock); for (auto &op : oldBlock->getOperations()) { Operation *clonedOp = rewriter.clone(op, mapping); mapping.map(op.getResults(), clonedOp->getResults()); } // Replace the results of the old op with the new output buffers. rewriter.replaceOp(genericOp, outputs); } /// Specialization for all other `linalg::LinalgOp`. static void finalizeBufferAllocation(ConversionPatternRewriter &rewriter, linalg::LinalgOp linalgOp, ValueRange inputs, ValueRange outputs) { assert(!isa(linalgOp.getOperation())); assert(!isa(linalgOp.getOperation())); SmallVector newOperands = inputs; newOperands.append(outputs.begin(), outputs.end()); auto otherOperands = linalgOp.getAssumedNonShapedOperands(); newOperands.append(otherOperands.begin(), otherOperands.end()); linalgOp.clone(rewriter, linalgOp.getLoc(), /*resultTypes=*/ArrayRef{}, newOperands); // Replace the results of the old op with the new output buffers. rewriter.replaceOp(linalgOp, outputs); } //===----------------------------------------------------------------------===// // Bufferization patterns. 
//===----------------------------------------------------------------------===// namespace { /// Generic conversion pattern that matches any LinalgOp. This avoids template /// instantiating one pattern for each LinalgOp. class BufferizeInitTensorOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(InitTensorOp op, ArrayRef operands, ConversionPatternRewriter &rewriter) const final { linalg::InitTensorOpAdaptor adaptor(operands, op->getAttrDictionary()); rewriter.replaceOpWithNewOp( op, getTypeConverter()->convertType(op.getType()).cast(), adaptor.sizes()); return success(); } }; /// Generic conversion pattern that matches any LinalgOp. This avoids template /// instantiating one pattern for each LinalgOp. class BufferizeAnyLinalgOp : public ConversionPattern { public: BufferizeAnyLinalgOp(TypeConverter &typeConverter) : ConversionPattern(/*benefit=*/1, typeConverter, MatchAnyOpTypeTag()) {} LogicalResult matchAndRewrite(Operation *op, ArrayRef operands, ConversionPatternRewriter &rewriter) const final { LinalgOp linalgOp = dyn_cast(op); if (!linalgOp) return failure(); // We abuse the GenericOpAdaptor here. // TODO: Manually create an Adaptor that captures inputs and outputs for all // linalg::LinalgOp interface ops. linalg::GenericOpAdaptor adaptor(operands, op->getAttrDictionary()); Location loc = linalgOp.getLoc(); SmallVector newOutputBuffers; if (failed(allocateBuffersForResults(loc, linalgOp, adaptor, newOutputBuffers, rewriter))) { linalgOp.emitOpError() << "Failed to allocate buffers for tensor results."; return failure(); } // Delegate to the linalg generic pattern. if (auto genericOp = dyn_cast(op)) { finalizeBufferAllocationForGenericOp( rewriter, genericOp, adaptor.inputs(), newOutputBuffers); return success(); } // Delegate to the linalg indexed generic pattern. if (auto genericOp = dyn_cast(op)) { finalizeBufferAllocationForGenericOp( rewriter, genericOp, adaptor.inputs(), newOutputBuffers); return success(); } finalizeBufferAllocation(rewriter, linalgOp, adaptor.inputs(), newOutputBuffers); return success(); } }; // Extract int64_t values from the assumed ArrayAttr of IntegerAttr. static SmallVector extractFromI64ArrayAttr(Attribute attr) { return llvm::to_vector<4>( llvm::map_range(attr.cast(), [](Attribute a) -> int64_t { return a.cast().getInt(); })); } /// Convert `subtensor %t [offsets][sizes][strides] -> %st` to an alloc + copy /// pattern. /// ``` /// %a = alloc(sizes) /// %sv = subview %source [offsets][sizes][strides] /// linalg_copy(%sv, %a) /// ``` /// /// This pattern is arguable a std pattern once linalg::CopyOp becomes /// std::CopyOp. class SubTensorOpConverter : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(SubTensorOp op, ArrayRef operands, ConversionPatternRewriter &rewriter) const final { SubTensorOpAdaptor adaptor(operands, op->getAttrDictionary()); Value sourceMemref = adaptor.source(); assert(sourceMemref.getType().isa()); MemRefType subviewMemRefType = getTypeConverter()->convertType(op.getType()).cast(); // op.sizes() capture exactly the dynamic alloc operands matching the // subviewMemRefType thanks to subview/subtensor canonicalization and // verification. 
Value alloc = rewriter.create(op.getLoc(), subviewMemRefType, op.sizes()); Value subView = rewriter.create( op.getLoc(), sourceMemref, extractFromI64ArrayAttr(op.static_offsets()), extractFromI64ArrayAttr(op.static_sizes()), extractFromI64ArrayAttr(op.static_strides()), op.offsets(), op.sizes(), op.strides()); rewriter.create(op.getLoc(), subView, alloc); rewriter.replaceOp(op, alloc); return success(); } }; /// Convert `subtensor_insert %source into %dest [offsets][sizes][strides] -> /// %t` to an tensor_to_memref + subview + copy + tensor_load pattern. /// tensor_to_memref and tensor_load are inserted automatically by the /// conversion infra: /// ``` /// %sv = subview %dest [offsets][sizes][strides] /// linalg_copy(%source, %sv) /// // replace with %dest /// ``` /// /// This pattern is arguable a std pattern once linalg::CopyOp becomes /// std::CopyOp. class SubTensorInsertOpConverter : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(SubTensorInsertOp op, ArrayRef operands, ConversionPatternRewriter &rewriter) const final { SubTensorInsertOpAdaptor adaptor(operands, op->getAttrDictionary()); Value sourceMemRef = adaptor.source(); assert(sourceMemRef.getType().isa()); // For now, be conservative and copy the converted input memref. // In general, the converted input memref here could be aliased or could // point into constant memory, so mutating it would lead to miscompilations. Value destMemRef = cloneMemref(op.getLoc(), adaptor.dest(), rewriter); assert(destMemRef.getType().isa()); // Take a subview to copy the small memref. Value subview = rewriter.create( op.getLoc(), destMemRef, extractFromI64ArrayAttr(op.static_offsets()), extractFromI64ArrayAttr(op.static_sizes()), extractFromI64ArrayAttr(op.static_strides()), adaptor.offsets(), adaptor.sizes(), adaptor.strides()); // Copy the small memref. rewriter.create(op.getLoc(), sourceMemRef, subview); rewriter.replaceOp(op, destMemRef); return success(); } }; } // namespace namespace { /// Converts Linalg operations that work on tensor-type operands or results to /// work on buffers. struct LinalgBufferizePass : public LinalgBufferizeBase { void runOnOperation() override { MLIRContext &context = getContext(); ConversionTarget target(context); BufferizeTypeConverter typeConverter; // Mark all Standard operations legal. target.addLegalDialect(); target.addIllegalOp(); // Mark all Linalg operations illegal as long as they work on tensors. auto isLegalOperation = [&](Operation *op) { return typeConverter.isLegal(op); }; target.addDynamicallyLegalDialect(isLegalOperation); target.addDynamicallyLegalOp(isLegalOperation); OwningRewritePatternList patterns; populateLinalgBufferizePatterns(&context, typeConverter, patterns); if (failed(applyPartialConversion(getOperation(), target, std::move(patterns)))) signalPassFailure(); } }; } // end anonymous namespace std::unique_ptr> mlir::createLinalgBufferizePass() { return std::make_unique(); } void mlir::linalg::populateLinalgBufferizePatterns( MLIRContext *context, BufferizeTypeConverter &typeConverter, OwningRewritePatternList &patterns) { patterns.insert(typeConverter); // TODO: Drop this once tensor constants work in standard. 
// clang-format off patterns.insert< BufferizeInitTensorOp, SubTensorOpConverter, SubTensorInsertOpConverter >(typeConverter, context); // clang-format on } diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseToLinalg.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseToLinalg.cpp index ada9f8c02b89..3012e920a82f 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseToLinalg.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseToLinalg.cpp @@ -1,154 +1,147 @@ //===- ElementwiseToLinalg.cpp - conversion of elementwise to linalg ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "mlir/Dialect/Linalg/Passes.h" #include "PassDetail.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Dialect/StandardOps/Utils/Utils.h" #include "mlir/Transforms/DialectConversion.h" using namespace mlir; static bool isElementwiseMappableOpOnRankedTensors(Operation *op) { if (!op->hasTrait()) return false; // TODO: The conversion pattern can be made to work for `any_of` here, but // it's more complex as it requires tracking which operands are scalars. return llvm::all_of(op->getOperandTypes(), [](Type type) { return type.isa(); }); } /// Given `op` assumed `isElementwiseMappableOpOnRankedTensors`, iterate over /// the result types and return a list of values such that, for each result type /// `t` and value `v` at the same index `idx`: /// 1. `v.getType() == t` /// 2. If an operand of `op` has type `t`, let `operand_first` be the first /// such operand. Then`v == operand_first`. /// 3. Otherwise, v is a newly created `linalg::InitTensorOp` with: /// a. Static and dynamic dims extracted from the first operand of `op`. /// b. Elemental type equal to the elemental type of `t`. /// /// This is sufficient because ElementwiseMappable guarantees that "The static /// types of all vector (resp. tensor) operands and results must have the same /// shape". static SmallVector getOrCreateOperandsMatchingResultTypes(OpBuilder &b, Operation *op) { assert(isElementwiseMappableOpOnRankedTensors(op)); Location loc = op->getLoc(); ValueRange operands = op->getOperands(); TypeRange rankedTensorTypes = op->getResultTypes(); SmallVector res; res.reserve(rankedTensorTypes.size()); for (Type t : rankedTensorTypes) { // Try to find an operand with type matching the result tensor. bool found = false; for (Value v : operands) { if (v.getType() == t) { found = true; res.push_back(v); break; } } if (found) continue; // Extract static / dynamic shape mix from the first operand. Value firstOperand = operands.front(); auto rankedTensorType = t.cast(); - SmallVector dynamicShape; - SmallVector staticShape; - dynamicShape.reserve(rankedTensorType.getRank()); - staticShape.reserve(rankedTensorType.getRank()); - unsigned idx = 0; - for (auto shape : rankedTensorType.getShape()) { - staticShape.push_back(shape); - if (rankedTensorType.isDynamicDim(idx)) - dynamicShape.push_back(b.create(loc, firstOperand, idx)); - ++idx; - } - // Create init tensor. 
+ auto staticShape = llvm::to_vector<4>(rankedTensorType.getShape()); + auto dynamicShape = getDynOperands(loc, firstOperand, b); + res.push_back(b.create( loc, dynamicShape, staticShape, rankedTensorType.getElementType())); } return res; } namespace { struct ConvertAnyElementwiseMappableOpOnRankedTensors : public RewritePattern { ConvertAnyElementwiseMappableOpOnRankedTensors() : RewritePattern(/*benefit=*/1, MatchAnyOpTypeTag()) {} LogicalResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const final { if (!isElementwiseMappableOpOnRankedTensors(op)) return rewriter.notifyMatchFailure( op, "requires elementwise op on ranked tensors"); auto rank = op->getResult(0).getType().cast().getRank(); SmallVector indexingMaps( op->getNumResults() + op->getNumOperands(), rewriter.getMultiDimIdentityMap(rank)); SmallVector iteratorTypes(rank, getParallelIteratorTypeName()); auto outputs = getOrCreateOperandsMatchingResultTypes(rewriter, op); rewriter.replaceOpWithNewOp( op, /*resultTensorTypes=*/op->getResultTypes(), /*inputs=*/op->getOperands(), /*outputs=*/outputs, /*indexingMaps=*/indexingMaps, /*iteratorTypes=*/iteratorTypes, /*bodyBuilder=*/ [&](OpBuilder &builder, Location loc, ValueRange regionArgs) { OperationState state(loc, op->getName()); state.addAttributes(op->getAttrs()); // Only take the input operands in the cloned elementwise op. state.addOperands(regionArgs.take_front(op->getNumOperands())); auto resultTypes = llvm::to_vector<6>( llvm::map_range(op->getResultTypes(), [](Type type) { return type.cast().getElementType(); })); state.addTypes(resultTypes); auto *scalarOp = builder.createOperation(state); builder.create(loc, scalarOp->getResults()); }); return success(); } }; } // namespace void mlir::populateElementwiseToLinalgConversionPatterns( OwningRewritePatternList &patterns, MLIRContext *) { patterns.insert(); } namespace { class ConvertElementwiseToLinalgPass : public ConvertElementwiseToLinalgBase { void runOnFunction() final { auto func = getOperation(); auto *context = &getContext(); ConversionTarget target(*context); OwningRewritePatternList patterns; populateElementwiseToLinalgConversionPatterns(patterns, context); target.markUnknownOpDynamicallyLegal([](Operation *op) { return !isElementwiseMappableOpOnRankedTensors(op); }); if (failed(applyPartialConversion(func, target, std::move(patterns)))) signalPassFailure(); } }; } // namespace std::unique_ptr> mlir::createConvertElementwiseToLinalgPass() { return std::make_unique(); } diff --git a/mlir/lib/Dialect/StandardOps/CMakeLists.txt b/mlir/lib/Dialect/StandardOps/CMakeLists.txt index 503a3d9327ff..67f285817a91 100644 --- a/mlir/lib/Dialect/StandardOps/CMakeLists.txt +++ b/mlir/lib/Dialect/StandardOps/CMakeLists.txt @@ -1,23 +1,24 @@ add_mlir_dialect_library(MLIRStandard IR/Ops.cpp EDSC/Builders.cpp EDSC/Intrinsics.cpp + Utils/Utils.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/StandardOps DEPENDS MLIRStandardOpsIncGen LINK_LIBS PUBLIC MLIRCallInterfaces MLIRControlFlowInterfaces MLIREDSC MLIRIR MLIRSideEffectInterfaces MLIRTensor MLIRVectorInterfaces MLIRViewLikeInterface ) add_subdirectory(Transforms) diff --git a/mlir/lib/Dialect/StandardOps/Utils/Utils.cpp b/mlir/lib/Dialect/StandardOps/Utils/Utils.cpp new file mode 100644 index 000000000000..d01eb130b543 --- /dev/null +++ b/mlir/lib/Dialect/StandardOps/Utils/Utils.cpp @@ -0,0 +1,28 @@ +//===- Utils.cpp - Utilities to support the Linalg dialect ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities for the StandardOps dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/StandardOps/Utils/Utils.h"
+
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+
+using namespace mlir;
+
+SmallVector<Value, 4> mlir::getDynOperands(Location loc, Value val,
+                                           OpBuilder &b) {
+  SmallVector<Value, 4> dynOperands;
+  auto shapedType = val.getType().cast<ShapedType>();
+  for (auto dim : llvm::enumerate(shapedType.getShape())) {
+    if (dim.value() == TensorType::kDynamicSize)
+      dynOperands.push_back(b.create<DimOp>(loc, val, dim.index()));
+  }
+  return dynOperands;
+}
diff --git a/mlir/lib/Transforms/BufferDeallocation.cpp b/mlir/lib/Transforms/BufferDeallocation.cpp
index 07d67c6f3f0a..bdd8515bdc05 100644
--- a/mlir/lib/Transforms/BufferDeallocation.cpp
+++ b/mlir/lib/Transforms/BufferDeallocation.cpp
@@ -1,530 +1,526 @@
 //===- BufferDeallocation.cpp - the impl for buffer deallocation ----------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements logic for computing correct alloc and dealloc
 // positions. Furthermore, it inserts the new alloc and copy operations that
 // are required to ensure that all buffers are deallocated. The main class is
 // BufferDeallocationPass, which implements the underlying algorithm. In order
 // to put allocations and deallocations at safe positions, it is important to
 // place them in the correct blocks. However, the liveness analysis does not
 // pay attention to aliases, which can occur due to branches (and their
 // associated block arguments) in general. For this purpose,
 // BufferDeallocation first finds all possible aliases of a single value
 // (using the BufferAliasAnalysis class). Consider the following example:
 //
 // ^bb0(%arg0):
 //   cond_br %cond, ^bb1, ^bb2
 // ^bb1:
 //   br ^exit(%arg0)
 // ^bb2:
 //   %new_value = ...
 //   br ^exit(%new_value)
 // ^exit(%arg1):
 //   return %arg1;
 //
 // We should place the dealloc for %new_value in exit. However, %new_value
 // cannot be freed safely in that post dominator; it has to be freed in the
 // block that defines it, which in turn requires a new copy buffer for %arg1
 // that will contain the actual contents. Using the BufferAliasAnalysis class,
 // we will find out that %new_value has a potential alias %arg1. In order to
 // find the dealloc position, we have to find all potential aliases, iterate
 // over their uses and find the common post-dominator block (note that
 // additional copies and buffers remove potential aliases and will influence
 // the placement of the deallocs). In all cases, the computed block can be
 // safely used to free the %new_value buffer (this may be exit or bb2) as it
 // will die there, and we can use liveness information to determine the exact
 // operation after which we have to insert the dealloc. In general, the
 // algorithm supports introducing copy buffers and placing deallocs in safe
 // locations to ensure that all buffers are freed in the end.
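 //
 // As a rough illustration (assuming the copy insertion described above),
 // ^bb2 from the example could end up looking like this after the pass has
 // run; %copy is a placeholder name and the exact output depends on the
 // alias analysis:
 //
 // ^bb2:
 //   %new_value = ...
 //   %copy = alloc(...)              // temporary buffer handed to the branch
 //   linalg.copy(%new_value, %copy)  // preserve the contents for ^exit
 //   dealloc %new_value              // %new_value can now be freed locally
 //   br ^exit(%copy)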
// // TODO: // The current implementation does not support explicit-control-flow loops and // the resulting code will be invalid with respect to program semantics. // However, structured control-flow loops are fully supported. Furthermore, it // doesn't accept functions which return buffers already. // //===----------------------------------------------------------------------===// #include "PassDetail.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Dialect/StandardOps/Utils/Utils.h" #include "mlir/IR/Operation.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/BufferUtils.h" #include "mlir/Transforms/Passes.h" #include "llvm/ADT/SetOperations.h" using namespace mlir; /// Walks over all immediate return-like terminators in the given region. template static void walkReturnOperations(Region *region, const FuncT &func) { for (Block &block : *region) for (Operation &operation : block) { // Skip non-return-like terminators. if (operation.hasTrait()) func(&operation); } } namespace { //===----------------------------------------------------------------------===// // Backedges analysis //===----------------------------------------------------------------------===// /// A straight-forward program analysis which detects loop backedges induced by /// explicit control flow. class Backedges { public: using BlockSetT = SmallPtrSet; using BackedgeSetT = llvm::DenseSet>; public: /// Constructs a new backedges analysis using the op provided. Backedges(Operation *op) { recurse(op, op->getBlock()); } /// Returns the number of backedges formed by explicit control flow. size_t size() const { return edgeSet.size(); } /// Returns the start iterator to loop over all backedges. BackedgeSetT::const_iterator begin() const { return edgeSet.begin(); } /// Returns the end iterator to loop over all backedges. BackedgeSetT::const_iterator end() const { return edgeSet.end(); } private: /// Enters the current block and inserts a backedge into the `edgeSet` if we /// have already visited the current block. The inserted edge links the given /// `predecessor` with the `current` block. bool enter(Block ¤t, Block *predecessor) { bool inserted = visited.insert(¤t).second; if (!inserted) edgeSet.insert(std::make_pair(predecessor, ¤t)); return inserted; } /// Leaves the current block. void exit(Block ¤t) { visited.erase(¤t); } /// Recurses into the given operation while taking all attached regions into /// account. void recurse(Operation *op, Block *predecessor) { Block *current = op->getBlock(); // If the current op implements the `BranchOpInterface`, there can be // cycles in the scope of all successor blocks. if (isa(op)) { for (Block *succ : current->getSuccessors()) recurse(*succ, current); } // Recurse into all distinct regions and check for explicit control-flow // loops. for (Region ®ion : op->getRegions()) recurse(region.front(), current); } /// Recurses into explicit control-flow structures that are given by /// the successor relation defined on the block level. void recurse(Block &block, Block *predecessor) { // Try to enter the current block. If this is not possible, we are // currently processing this block and can safely return here. if (!enter(block, predecessor)) return; // Recurse into all operations and successor blocks. for (Operation &op : block.getOperations()) recurse(&op, predecessor); // Leave the current block. 
exit(block); } /// Stores all blocks that are currently visited and on the processing stack. BlockSetT visited; /// Stores all backedges in the format (source, target). BackedgeSetT edgeSet; }; //===----------------------------------------------------------------------===// // BufferDeallocation //===----------------------------------------------------------------------===// /// The buffer deallocation transformation which ensures that all allocs in the /// program have a corresponding de-allocation. As a side-effect, it might also /// introduce copies that in turn leads to additional allocs and de-allocations. class BufferDeallocation : BufferPlacementTransformationBase { public: BufferDeallocation(Operation *op) : BufferPlacementTransformationBase(op), dominators(op), postDominators(op) {} /// Performs the actual placement/creation of all temporary alloc, copy and /// dealloc nodes. void deallocate() { // Add additional allocations and copies that are required. introduceCopies(); // Place deallocations for all allocation entries. placeDeallocs(); } private: /// Introduces required allocs and copy operations to avoid memory leaks. void introduceCopies() { // Initialize the set of values that require a dedicated memory free // operation since their operands cannot be safely deallocated in a post // dominator. SmallPtrSet valuesToFree; llvm::SmallDenseSet> visitedValues; SmallVector, 8> toProcess; // Check dominance relation for proper dominance properties. If the given // value node does not dominate an alias, we will have to create a copy in // order to free all buffers that can potentially leak into a post // dominator. auto findUnsafeValues = [&](Value source, Block *definingBlock) { auto it = aliases.find(source); if (it == aliases.end()) return; for (Value value : it->second) { if (valuesToFree.count(value) > 0) continue; Block *parentBlock = value.getParentBlock(); // Check whether we have to free this particular block argument or // generic value. We have to free the current alias if it is either // defined in a non-dominated block or it is defined in the same block // but the current value is not dominated by the source value. if (!dominators.dominates(definingBlock, parentBlock) || (definingBlock == parentBlock && value.isa())) { toProcess.emplace_back(value, parentBlock); valuesToFree.insert(value); } else if (visitedValues.insert(std::make_tuple(value, definingBlock)) .second) toProcess.emplace_back(value, definingBlock); } }; // Detect possibly unsafe aliases starting from all allocations. for (BufferPlacementAllocs::AllocEntry &entry : allocs) { Value allocValue = std::get<0>(entry); findUnsafeValues(allocValue, allocValue.getDefiningOp()->getBlock()); } // Try to find block arguments that require an explicit free operation // until we reach a fix point. while (!toProcess.empty()) { auto current = toProcess.pop_back_val(); findUnsafeValues(std::get<0>(current), std::get<1>(current)); } // Update buffer aliases to ensure that we free all buffers and block // arguments at the correct locations. aliases.remove(valuesToFree); // Add new allocs and additional copy operations. for (Value value : valuesToFree) { if (auto blockArg = value.dyn_cast()) introduceBlockArgCopy(blockArg); else introduceValueCopyForRegionResult(value); // Register the value to require a final dealloc. Note that we do not have // to assign a block here since we do not want to move the allocation node // to another location. 
allocs.registerAlloc(std::make_tuple(value, nullptr)); } } /// Introduces temporary allocs in all predecessors and copies the source /// values into the newly allocated buffers. void introduceBlockArgCopy(BlockArgument blockArg) { // Allocate a buffer for the current block argument in the block of // the associated value (which will be a predecessor block by // definition). Block *block = blockArg.getOwner(); for (auto it = block->pred_begin(), e = block->pred_end(); it != e; ++it) { // Get the terminator and the value that will be passed to our // argument. Operation *terminator = (*it)->getTerminator(); auto branchInterface = cast(terminator); // Query the associated source value. Value sourceValue = branchInterface.getSuccessorOperands(it.getSuccessorIndex()) .getValue()[blockArg.getArgNumber()]; // Create a new alloc and copy at the current location of the terminator. Value alloc = introduceBufferCopy(sourceValue, terminator); // Wire new alloc and successor operand. auto mutableOperands = branchInterface.getMutableSuccessorOperands(it.getSuccessorIndex()); if (!mutableOperands.hasValue()) terminator->emitError() << "terminators with immutable successor " "operands are not supported"; else mutableOperands.getValue() .slice(blockArg.getArgNumber(), 1) .assign(alloc); } // Check whether the block argument has implicitly defined predecessors via // the RegionBranchOpInterface. This can be the case if the current block // argument belongs to the first block in a region and the parent operation // implements the RegionBranchOpInterface. Region *argRegion = block->getParent(); Operation *parentOp = argRegion->getParentOp(); RegionBranchOpInterface regionInterface; if (!argRegion || &argRegion->front() != block || !(regionInterface = dyn_cast(parentOp))) return; introduceCopiesForRegionSuccessors( regionInterface, argRegion->getParentOp()->getRegions(), blockArg, [&](RegionSuccessor &successorRegion) { // Find a predecessor of our argRegion. return successorRegion.getSuccessor() == argRegion; }); // Check whether the block argument belongs to an entry region of the // parent operation. In this case, we have to introduce an additional copy // for buffer that is passed to the argument. SmallVector successorRegions; regionInterface.getSuccessorRegions(/*index=*/llvm::None, successorRegions); auto *it = llvm::find_if(successorRegions, [&](RegionSuccessor &successorRegion) { return successorRegion.getSuccessor() == argRegion; }); if (it == successorRegions.end()) return; // Determine the actual operand to introduce a copy for and rewire the // operand to point to the copy instead. Value operand = regionInterface.getSuccessorEntryOperands(argRegion->getRegionNumber()) [llvm::find(it->getSuccessorInputs(), blockArg).getIndex()]; Value copy = introduceBufferCopy(operand, parentOp); auto op = llvm::find(parentOp->getOperands(), operand); assert(op != parentOp->getOperands().end() && "parentOp does not contain operand"); parentOp->setOperand(op.getIndex(), copy); } /// Introduces temporary allocs in front of all associated nested-region /// terminators and copies the source values into the newly allocated buffers. void introduceValueCopyForRegionResult(Value value) { // Get the actual result index in the scope of the parent terminator. Operation *operation = value.getDefiningOp(); auto regionInterface = cast(operation); // Filter successors that return to the parent operation. 
auto regionPredicate = [&](RegionSuccessor &successorRegion) { // If the RegionSuccessor has no associated successor, it will return to // its parent operation. return !successorRegion.getSuccessor(); }; // Introduce a copy for all region "results" that are returned to the parent // operation. This is required since the parent's result value has been // considered critical. Therefore, the algorithm assumes that a copy of a // previously allocated buffer is returned by the operation (like in the // case of a block argument). introduceCopiesForRegionSuccessors(regionInterface, operation->getRegions(), value, regionPredicate); } /// Introduces buffer copies for all terminators in the given regions. The /// regionPredicate is applied to every successor region in order to restrict /// the copies to specific regions. template void introduceCopiesForRegionSuccessors( RegionBranchOpInterface regionInterface, MutableArrayRef regions, Value argValue, const TPredicate ®ionPredicate) { for (Region ®ion : regions) { // Query the regionInterface to get all successor regions of the current // one. SmallVector successorRegions; regionInterface.getSuccessorRegions(region.getRegionNumber(), successorRegions); // Try to find a matching region successor. RegionSuccessor *regionSuccessor = llvm::find_if(successorRegions, regionPredicate); if (regionSuccessor == successorRegions.end()) continue; // Get the operand index in the context of the current successor input // bindings. size_t operandIndex = llvm::find(regionSuccessor->getSuccessorInputs(), argValue) .getIndex(); // Iterate over all immediate terminator operations to introduce // new buffer allocations. Thereby, the appropriate terminator operand // will be adjusted to point to the newly allocated buffer instead. walkReturnOperations(®ion, [&](Operation *terminator) { // Extract the source value from the current terminator. Value sourceValue = terminator->getOperand(operandIndex); // Create a new alloc at the current location of the terminator. Value alloc = introduceBufferCopy(sourceValue, terminator); // Wire alloc and terminator operand. terminator->setOperand(operandIndex, alloc); }); } } /// Creates a new memory allocation for the given source value and copies /// its content into the newly allocated buffer. The terminator operation is /// used to insert the alloc and copy operations at the right places. Value introduceBufferCopy(Value sourceValue, Operation *terminator) { // Avoid multiple copies of the same source value. This can happen in the // presence of loops when a branch acts as a backedge while also having // another successor that returns to its parent operation. Note: that // copying copied buffers can introduce memory leaks since the invariant of // BufferPlacement assumes that a buffer will be only copied once into a // temporary buffer. Hence, the construction of copy chains introduces // additional allocations that are not tracked automatically by the // algorithm. if (copiedValues.contains(sourceValue)) return sourceValue; // Create a new alloc at the current location of the terminator. auto memRefType = sourceValue.getType().cast(); OpBuilder builder(terminator); // Extract information about dynamically shaped types by // extracting their dynamic dimensions. 
- SmallVector dynamicOperands; - for (auto shapeElement : llvm::enumerate(memRefType.getShape())) { - if (!ShapedType::isDynamic(shapeElement.value())) - continue; - dynamicOperands.push_back(builder.create( - terminator->getLoc(), sourceValue, shapeElement.index())); - } + auto dynamicOperands = + getDynOperands(terminator->getLoc(), sourceValue, builder); // TODO: provide a generic interface to create dialect-specific // Alloc and CopyOp nodes. auto alloc = builder.create(terminator->getLoc(), memRefType, dynamicOperands); // Create a new copy operation that copies to contents of the old // allocation to the new one. builder.create(terminator->getLoc(), sourceValue, alloc); // Remember the copy of original source value. copiedValues.insert(alloc); return alloc; } /// Finds correct dealloc positions according to the algorithm described at /// the top of the file for all alloc nodes and block arguments that can be /// handled by this analysis. void placeDeallocs() const { // Move or insert deallocs using the previously computed information. // These deallocations will be linked to their associated allocation nodes // since they don't have any aliases that can (potentially) increase their // liveness. for (const BufferPlacementAllocs::AllocEntry &entry : allocs) { Value alloc = std::get<0>(entry); auto aliasesSet = aliases.resolve(alloc); assert(aliasesSet.size() > 0 && "must contain at least one alias"); // Determine the actual block to place the dealloc and get liveness // information. Block *placementBlock = findCommonDominator(alloc, aliasesSet, postDominators); const LivenessBlockInfo *livenessInfo = liveness.getLiveness(placementBlock); // We have to ensure that the dealloc will be after the last use of all // aliases of the given value. We first assume that there are no uses in // the placementBlock and that we can safely place the dealloc at the // beginning. Operation *endOperation = &placementBlock->front(); // Iterate over all aliases and ensure that the endOperation will point // to the last operation of all potential aliases in the placementBlock. for (Value alias : aliasesSet) { // Ensure that the start operation is at least the defining operation of // the current alias to avoid invalid placement of deallocs for aliases // without any uses. Operation *beforeOp = endOperation; if (alias.getDefiningOp() && !(beforeOp = placementBlock->findAncestorOpInBlock( *alias.getDefiningOp()))) continue; Operation *aliasEndOperation = livenessInfo->getEndOperation(alias, beforeOp); // Check whether the aliasEndOperation lies in the desired block and // whether it is behind the current endOperation. If yes, this will be // the new endOperation. if (aliasEndOperation->getBlock() == placementBlock && endOperation->isBeforeInBlock(aliasEndOperation)) endOperation = aliasEndOperation; } // endOperation is the last operation behind which we can safely store // the dealloc taking all potential aliases into account. // If there is an existing dealloc, move it to the right place. Operation *deallocOperation = std::get<1>(entry); if (deallocOperation) { deallocOperation->moveAfter(endOperation); } else { // If the Dealloc position is at the terminator operation of the // block, then the value should escape from a deallocation. Operation *nextOp = endOperation->getNextNode(); if (!nextOp) continue; // If there is no dealloc node, insert one in the right place. 
OpBuilder builder(nextOp); builder.create(alloc.getLoc(), alloc); } } } /// The dominator info to find the appropriate start operation to move the /// allocs. DominanceInfo dominators; /// The post dominator info to move the dependent allocs in the right /// position. PostDominanceInfo postDominators; /// Stores already copied allocations to avoid additional copies of copies. ValueSetT copiedValues; }; //===----------------------------------------------------------------------===// // BufferDeallocationPass //===----------------------------------------------------------------------===// /// The actual buffer deallocation pass that inserts and moves dealloc nodes /// into the right positions. Furthermore, it inserts additional allocs and /// copies if necessary. It uses the algorithm described at the top of the file. struct BufferDeallocationPass : BufferDeallocationBase { void runOnFunction() override { // Ensure that there are supported loops only. Backedges backedges(getFunction()); if (backedges.size()) { getFunction().emitError( "Structured control-flow loops are supported only."); return; } // Place all required temporary alloc, copy and dealloc nodes. BufferDeallocation deallocation(getFunction()); deallocation.deallocate(); } }; } // end anonymous namespace //===----------------------------------------------------------------------===// // BufferDeallocationPass construction //===----------------------------------------------------------------------===// std::unique_ptr mlir::createBufferDeallocationPass() { return std::make_unique(); } diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp index 564193e22690..025b1b217476 100644 --- a/mlir/lib/Transforms/PipelineDataTransfer.cpp +++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp @@ -1,368 +1,364 @@ //===- PipelineDataTransfer.cpp --- Pass for pipelining data movement ---*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements a pass to pipeline data transfers. // //===----------------------------------------------------------------------===// #include "PassDetail.h" #include "mlir/Transforms/Passes.h" #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/StandardOps/Utils/Utils.h" #include "mlir/IR/Builders.h" #include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Utils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "affine-pipeline-data-transfer" using namespace mlir; namespace { struct PipelineDataTransfer : public AffinePipelineDataTransferBase { void runOnFunction() override; void runOnAffineForOp(AffineForOp forOp); std::vector forOps; }; } // end anonymous namespace /// Creates a pass to pipeline explicit movement of data across levels of the /// memory hierarchy. std::unique_ptr> mlir::createPipelineDataTransferPass() { return std::make_unique(); } // Returns the position of the tag memref operand given a DMA operation. // Temporary utility: will be replaced when DmaStart/DmaFinish abstract op's are // added. 
TODO static unsigned getTagMemRefPos(Operation &dmaOp) { assert((isa(dmaOp))); if (auto dmaStartOp = dyn_cast(dmaOp)) { return dmaStartOp.getTagMemRefOperandIndex(); } // First operand for a dma finish operation. return 0; } /// Doubles the buffer of the supplied memref on the specified 'affine.for' /// operation by adding a leading dimension of size two to the memref. /// Replaces all uses of the old memref by the new one while indexing the newly /// added dimension by the loop IV of the specified 'affine.for' operation /// modulo 2. Returns false if such a replacement cannot be performed. static bool doubleBuffer(Value oldMemRef, AffineForOp forOp) { auto *forBody = forOp.getBody(); OpBuilder bInner(forBody, forBody->begin()); // Doubles the shape with a leading dimension extent of 2. auto doubleShape = [&](MemRefType oldMemRefType) -> MemRefType { // Add the leading dimension in the shape for the double buffer. ArrayRef oldShape = oldMemRefType.getShape(); SmallVector newShape(1 + oldMemRefType.getRank()); newShape[0] = 2; std::copy(oldShape.begin(), oldShape.end(), newShape.begin() + 1); return MemRefType::Builder(oldMemRefType) .setShape(newShape) .setAffineMaps({}); }; auto oldMemRefType = oldMemRef.getType().cast(); auto newMemRefType = doubleShape(oldMemRefType); // The double buffer is allocated right before 'forOp'. OpBuilder bOuter(forOp); // Put together alloc operands for any dynamic dimensions of the memref. - SmallVector allocOperands; - unsigned dynamicDimCount = 0; - for (auto dimSize : oldMemRefType.getShape()) { - if (dimSize == -1) - allocOperands.push_back( - bOuter.create(forOp.getLoc(), oldMemRef, dynamicDimCount++)); - } + + auto allocOperands = getDynOperands(forOp.getLoc(), oldMemRef, bOuter); // Create and place the alloc right before the 'affine.for' operation. Value newMemRef = bOuter.create(forOp.getLoc(), newMemRefType, allocOperands); // Create 'iv mod 2' value to index the leading dimension. auto d0 = bInner.getAffineDimExpr(0); int64_t step = forOp.getStep(); auto modTwoMap = AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, d0.floorDiv(step) % 2); auto ivModTwoOp = bInner.create(forOp.getLoc(), modTwoMap, forOp.getInductionVar()); // replaceAllMemRefUsesWith will succeed unless the forOp body has // non-dereferencing uses of the memref (dealloc's are fine though). if (failed(replaceAllMemRefUsesWith( oldMemRef, newMemRef, /*extraIndices=*/{ivModTwoOp}, /*indexRemap=*/AffineMap(), /*extraOperands=*/{}, /*symbolOperands=*/{}, /*domInstFilter=*/&*forOp.getBody()->begin()))) { LLVM_DEBUG( forOp.emitError("memref replacement for double buffering failed")); ivModTwoOp.erase(); return false; } // Insert the dealloc op right after the for loop. bOuter.setInsertionPointAfter(forOp); bOuter.create(forOp.getLoc(), newMemRef); return true; } /// Returns success if the IR is in a valid state. void PipelineDataTransfer::runOnFunction() { // Do a post order walk so that inner loop DMAs are processed first. This is // necessary since 'affine.for' operations nested within would otherwise // become invalid (erased) when the outer loop is pipelined (the pipelined one // gets deleted and replaced by a prologue, a new steady-state loop and an // epilogue). forOps.clear(); getFunction().walk([&](AffineForOp forOp) { forOps.push_back(forOp); }); for (auto forOp : forOps) runOnAffineForOp(forOp); } // Check if tags of the dma start op and dma wait op match. 
static bool checkTagMatch(AffineDmaStartOp startOp, AffineDmaWaitOp waitOp) { if (startOp.getTagMemRef() != waitOp.getTagMemRef()) return false; auto startIndices = startOp.getTagIndices(); auto waitIndices = waitOp.getTagIndices(); // Both of these have the same number of indices since they correspond to the // same tag memref. for (auto it = startIndices.begin(), wIt = waitIndices.begin(), e = startIndices.end(); it != e; ++it, ++wIt) { // Keep it simple for now, just checking if indices match. // TODO: this would in general need to check if there is no // intervening write writing to the same tag location, i.e., memory last // write/data flow analysis. This is however sufficient/powerful enough for // now since the DMA generation pass or the input for it will always have // start/wait with matching tags (same SSA operand indices). if (*it != *wIt) return false; } return true; } // Identify matching DMA start/finish operations to overlap computation with. static void findMatchingStartFinishInsts( AffineForOp forOp, SmallVectorImpl> &startWaitPairs) { // Collect outgoing DMA operations - needed to check for dependences below. SmallVector outgoingDmaOps; for (auto &op : *forOp.getBody()) { auto dmaStartOp = dyn_cast(op); if (dmaStartOp && dmaStartOp.isSrcMemorySpaceFaster()) outgoingDmaOps.push_back(dmaStartOp); } SmallVector dmaStartInsts, dmaFinishInsts; for (auto &op : *forOp.getBody()) { // Collect DMA finish operations. if (isa(op)) { dmaFinishInsts.push_back(&op); continue; } auto dmaStartOp = dyn_cast(op); if (!dmaStartOp) continue; // Only DMAs incoming into higher memory spaces are pipelined for now. // TODO: handle outgoing DMA pipelining. if (!dmaStartOp.isDestMemorySpaceFaster()) continue; // Check for dependence with outgoing DMAs. Doing this conservatively. // TODO: use the dependence analysis to check for // dependences between an incoming and outgoing DMA in the same iteration. auto it = outgoingDmaOps.begin(); for (; it != outgoingDmaOps.end(); ++it) { if (it->getDstMemRef() == dmaStartOp.getSrcMemRef()) break; } if (it != outgoingDmaOps.end()) continue; // We only double buffer if the buffer is not live out of loop. auto memref = dmaStartOp.getOperand(dmaStartOp.getFasterMemPos()); bool escapingUses = false; for (auto *user : memref.getUsers()) { // We can double buffer regardless of dealloc's outside the loop. if (isa(user)) continue; if (!forOp.getBody()->findAncestorOpInBlock(*user)) { LLVM_DEBUG(llvm::dbgs() << "can't pipeline: buffer is live out of loop\n";); escapingUses = true; break; } } if (!escapingUses) dmaStartInsts.push_back(&op); } // For each start operation, we look for a matching finish operation. for (auto *dmaStartOp : dmaStartInsts) { for (auto *dmaFinishOp : dmaFinishInsts) { if (checkTagMatch(cast(dmaStartOp), cast(dmaFinishOp))) { startWaitPairs.push_back({dmaStartOp, dmaFinishOp}); break; } } } } /// Overlap DMA transfers with computation in this loop. If successful, /// 'forOp' is deleted, and a prologue, a new pipelined loop, and epilogue are /// inserted right before where it was. 
void PipelineDataTransfer::runOnAffineForOp(AffineForOp forOp) { auto mayBeConstTripCount = getConstantTripCount(forOp); if (!mayBeConstTripCount.hasValue()) { LLVM_DEBUG(forOp.emitRemark("won't pipeline due to unknown trip count")); return; } SmallVector, 4> startWaitPairs; findMatchingStartFinishInsts(forOp, startWaitPairs); if (startWaitPairs.empty()) { LLVM_DEBUG(forOp.emitRemark("No dma start/finish pairs\n")); return; } // Double the buffers for the higher memory space memref's. // Identify memref's to replace by scanning through all DMA start // operations. A DMA start operation has two memref's - the one from the // higher level of memory hierarchy is the one to double buffer. // TODO: check whether double-buffering is even necessary. // TODO: make this work with different layouts: assuming here that // the dimension we are adding here for the double buffering is the outermost // dimension. for (auto &pair : startWaitPairs) { auto *dmaStartOp = pair.first; Value oldMemRef = dmaStartOp->getOperand( cast(dmaStartOp).getFasterMemPos()); if (!doubleBuffer(oldMemRef, forOp)) { // Normally, double buffering should not fail because we already checked // that there are no uses outside. LLVM_DEBUG(llvm::dbgs() << "double buffering failed for" << dmaStartOp << "\n";); // IR still valid and semantically correct. return; } // If the old memref has no more uses, remove its 'dead' alloc if it was // alloc'ed. (note: DMA buffers are rarely function live-in; but a 'dim' // operation could have been used on it if it was dynamically shaped in // order to create the double buffer above.) // '-canonicalize' does this in a more general way, but we'll anyway do the // simple/common case so that the output / test cases looks clear. if (auto *allocOp = oldMemRef.getDefiningOp()) { if (oldMemRef.use_empty()) { allocOp->erase(); } else if (oldMemRef.hasOneUse()) { if (auto dealloc = dyn_cast(*oldMemRef.user_begin())) { dealloc.erase(); allocOp->erase(); } } } } // Double the buffers for tag memrefs. for (auto &pair : startWaitPairs) { auto *dmaFinishOp = pair.second; Value oldTagMemRef = dmaFinishOp->getOperand(getTagMemRefPos(*dmaFinishOp)); if (!doubleBuffer(oldTagMemRef, forOp)) { LLVM_DEBUG(llvm::dbgs() << "tag double buffering failed\n";); return; } // If the old tag has no uses or a single dealloc use, remove it. // (canonicalization handles more complex cases). if (auto *tagAllocOp = oldTagMemRef.getDefiningOp()) { if (oldTagMemRef.use_empty()) { tagAllocOp->erase(); } else if (oldTagMemRef.hasOneUse()) { if (auto dealloc = dyn_cast(*oldTagMemRef.user_begin())) { dealloc.erase(); tagAllocOp->erase(); } } } } // Double buffering would have invalidated all the old DMA start/wait insts. startWaitPairs.clear(); findMatchingStartFinishInsts(forOp, startWaitPairs); // Store shift for operation for later lookup for AffineApplyOp's. DenseMap instShiftMap; for (auto &pair : startWaitPairs) { auto *dmaStartOp = pair.first; assert(isa(dmaStartOp)); instShiftMap[dmaStartOp] = 0; // Set shifts for DMA start op's affine operand computation slices to 0. SmallVector sliceOps; mlir::createAffineComputationSlice(dmaStartOp, &sliceOps); if (!sliceOps.empty()) { for (auto sliceOp : sliceOps) { instShiftMap[sliceOp.getOperation()] = 0; } } else { // If a slice wasn't created, the reachable affine.apply op's from its // operands are the ones that go with it. 
SmallVector affineApplyInsts; SmallVector operands(dmaStartOp->getOperands()); getReachableAffineApplyOps(operands, affineApplyInsts); for (auto *op : affineApplyInsts) { instShiftMap[op] = 0; } } } // Everything else (including compute ops and dma finish) are shifted by one. for (auto &op : forOp.getBody()->without_terminator()) if (instShiftMap.find(&op) == instShiftMap.end()) instShiftMap[&op] = 1; // Get shifts stored in map. SmallVector shifts(forOp.getBody()->getOperations().size()); unsigned s = 0; for (auto &op : forOp.getBody()->without_terminator()) { assert(instShiftMap.find(&op) != instShiftMap.end()); shifts[s++] = instShiftMap[&op]; // Tagging operations with shifts for debugging purposes. LLVM_DEBUG({ OpBuilder b(&op); op.setAttr("shift", b.getI64IntegerAttr(shifts[s - 1])); }); } if (!isOpwiseShiftValid(forOp, shifts)) { // Violates dependences. LLVM_DEBUG(llvm::dbgs() << "Shifts invalid - unexpected\n";); return; } if (failed(affineForOpBodySkew(forOp, shifts))) { LLVM_DEBUG(llvm::dbgs() << "op body skewing failed - unexpected\n";); return; } }
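
For reference, the shape that all of these call sites now share can be written as one small, self-contained helper. The sketch below is illustrative only: it targets the APIs used in this patch (StandardOps AllocOp/DimOp, linalg::CopyOp, and the new getDynOperands), and the helper name cloneIntoNewBuffer is invented here; it simply restates cloneMemref from Bufferize.cpp as a standalone snippet.

#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/StandardOps/Utils/Utils.h"
#include "mlir/IR/Builders.h"

using namespace mlir;

/// Allocate a buffer with the same shape as `memref` and copy its contents
/// into it. getDynOperands emits one DimOp per dynamic dimension of `memref`,
/// in dimension order, which is exactly the dynamic-size operand list that
/// AllocOp expects.
static Value cloneIntoNewBuffer(Location loc, Value memref, OpBuilder &b) {
  auto memrefType = memref.getType().cast<MemRefType>();
  SmallVector<Value, 4> dynOperands = getDynOperands(loc, memref, b);
  Value alloc = b.create<AllocOp>(loc, memrefType, dynOperands);
  b.create<linalg::CopyOp>(loc, memref, alloc);
  return alloc;
}

Note that getDynOperands compares each extent against TensorType::kDynamicSize, the same -1 sentinel the removed PipelineDataTransfer code checked by hand, so the utility works for both tensor- and memref-typed values.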