Index: clang/docs/tools/clang-formatted-files.txt =================================================================== --- clang/docs/tools/clang-formatted-files.txt +++ clang/docs/tools/clang-formatted-files.txt @@ -2301,6 +2301,7 @@ flang/lib/Optimizer/Transforms/MemRefDataFlowOpt.cpp flang/lib/Optimizer/Transforms/PassDetail.h flang/lib/Optimizer/Transforms/RewriteLoop.cpp +flang/lib/Optimizer/Transforms/StackArrays.cpp flang/lib/Parser/basic-parsers.h flang/lib/Parser/char-block.cpp flang/lib/Parser/char-buffer.cpp Index: flang/include/flang/Optimizer/Builder/MutableBox.h =================================================================== --- flang/include/flang/Optimizer/Builder/MutableBox.h +++ flang/include/flang/Optimizer/Builder/MutableBox.h @@ -123,8 +123,8 @@ void genInlinedAllocation(fir::FirOpBuilder &builder, mlir::Location loc, const fir::MutableBoxValue &box, mlir::ValueRange lbounds, mlir::ValueRange extents, - mlir::ValueRange lenParams, - llvm::StringRef allocName); + mlir::ValueRange lenParams, llvm::StringRef allocName, + bool mustBeHeap = false); void genInlinedDeallocate(fir::FirOpBuilder &builder, mlir::Location loc, const fir::MutableBoxValue &box); Index: flang/include/flang/Optimizer/Dialect/FIRAttr.h =================================================================== --- flang/include/flang/Optimizer/Dialect/FIRAttr.h +++ flang/include/flang/Optimizer/Dialect/FIRAttr.h @@ -57,6 +57,15 @@ mlir::Type getType() const; }; +/// Attribute which can be applied to a fir.allocmem operation, specifying that +/// the allocation may not be moved to the heap by passes +class MustBeHeapAttr : public mlir::BoolAttr { +public: + using BoolAttr::BoolAttr; + + static constexpr llvm::StringRef getAttrName() { return "fir.must_be_heap"; } +}; + // Attributes for building SELECT CASE multiway branches /// A closed interval (including the bound values) is an interval with both an Index: flang/include/flang/Optimizer/Transforms/Passes.h =================================================================== --- flang/include/flang/Optimizer/Transforms/Passes.h +++ flang/include/flang/Optimizer/Transforms/Passes.h @@ -54,6 +54,7 @@ std::unique_ptr createMemDataFlowOptPass(); std::unique_ptr createPromoteToAffinePass(); std::unique_ptr createMemoryAllocationPass(); +std::unique_ptr createStackArraysPass(); std::unique_ptr createSimplifyIntrinsicsPass(); std::unique_ptr Index: flang/include/flang/Optimizer/Transforms/Passes.td =================================================================== --- flang/include/flang/Optimizer/Transforms/Passes.td +++ flang/include/flang/Optimizer/Transforms/Passes.td @@ -216,6 +216,16 @@ let constructor = "::fir::createMemoryAllocationPass()"; } +def StackArrays : Pass<"stack-arrays", "mlir::func::FuncOp"> { + let summary = "Move local array allocations from heap memory into stack memory"; + let description = [{ + Convert heap allocations for arrays, even those of unknown size, into stack + allocations. + }]; + let dependentDialects = [ "fir::FIROpsDialect" ]; + let constructor = "::fir::createStackArraysPass()"; +} + def SimplifyRegionLite : Pass<"simplify-region-lite", "mlir::ModuleOp"> { let summary = "Region simplification"; let description = [{ Index: flang/lib/Lower/Allocatable.cpp =================================================================== --- flang/lib/Lower/Allocatable.cpp +++ flang/lib/Lower/Allocatable.cpp @@ -365,7 +365,8 @@ } } fir::factory::genInlinedAllocation(builder, loc, box, lbounds, extents, - lenParams, mangleAlloc(alloc)); + lenParams, mangleAlloc(alloc), + /*mustBeHeap=*/true); } void genSimpleAllocation(const Allocation &alloc, Index: flang/lib/Optimizer/Builder/MutableBox.cpp =================================================================== --- flang/lib/Optimizer/Builder/MutableBox.cpp +++ flang/lib/Optimizer/Builder/MutableBox.cpp @@ -16,6 +16,7 @@ #include "flang/Optimizer/Builder/Runtime/Derived.h" #include "flang/Optimizer/Builder/Runtime/Stop.h" #include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/Dialect/FIRAttr.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" #include "flang/Optimizer/Support/FatalError.h" @@ -710,13 +711,11 @@ return newStorage; } -void fir::factory::genInlinedAllocation(fir::FirOpBuilder &builder, - mlir::Location loc, - const fir::MutableBoxValue &box, - mlir::ValueRange lbounds, - mlir::ValueRange extents, - mlir::ValueRange lenParams, - llvm::StringRef allocName) { +void fir::factory::genInlinedAllocation( + fir::FirOpBuilder &builder, mlir::Location loc, + const fir::MutableBoxValue &box, mlir::ValueRange lbounds, + mlir::ValueRange extents, mlir::ValueRange lenParams, + llvm::StringRef allocName, bool mustBeHeap) { auto lengths = getNewLengths(builder, loc, box, lenParams); llvm::SmallVector safeExtents; for (mlir::Value extent : extents) @@ -733,6 +732,9 @@ mlir::Value irBox = fir::factory::getMutableIRBox(builder, loc, box); fir::runtime::genDerivedTypeInitialize(builder, loc, irBox); } + + heap->setAttr(fir::MustBeHeapAttr::getAttrName(), + fir::MustBeHeapAttr::get(builder.getContext(), mustBeHeap)); } void fir::factory::genInlinedDeallocate(fir::FirOpBuilder &builder, Index: flang/lib/Optimizer/Transforms/CMakeLists.txt =================================================================== --- flang/lib/Optimizer/Transforms/CMakeLists.txt +++ flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -8,6 +8,7 @@ ArrayValueCopy.cpp ExternalNameConversion.cpp MemoryAllocation.cpp + StackArrays.cpp MemRefDataFlowOpt.cpp SimplifyRegionLite.cpp AlgebraicSimplification.cpp Index: flang/lib/Optimizer/Transforms/StackArrays.cpp =================================================================== --- /dev/null +++ flang/lib/Optimizer/Transforms/StackArrays.cpp @@ -0,0 +1,550 @@ +//===- StackArrays.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Dialect/FIRAttr.h" +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Transforms/Passes.h" +#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" +#include "mlir/Analysis/DataFlow/DenseAnalysis.h" +#include "mlir/Analysis/DataFlowFramework.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Value.h" +#include "mlir/Interfaces/LoopLikeInterface.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/Passes.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" + +namespace fir { +#define GEN_PASS_DEF_STACKARRAYS +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir + +#define DEBUG_TYPE "stack-arrays" + +namespace { + +/// The state of an SSA value at each program point +enum class AllocationState { + /// This means that the allocation state of a variable cannot be determined + /// at this program point, e.g. because one route through a conditional freed + /// the variable and the other route didn't. + /// This asserts a known-unknown: different from the unknown-unknown of having + /// no AllocationState stored for a particular SSA value + Unknown, + /// Means this SSA value was allocated on the heap in this function and has + /// now been freed + Freed, + /// Means this SSA value was allocated on the heap in this function and is a + /// candidate for moving to the stack + Allocated, +}; + +/// Maps SSA values to their AllocationState at a particular program point. +/// Also caches the insertion points for the new alloca operations +class LatticePoint : public mlir::dataflow::AbstractDenseLattice { + // Maps all values we are interested in to states + llvm::SmallDenseMap stateMap; + + // Maps only Freed values to the possible insertion point for an alloca + // operation. It feels a bit backwards calculating that here, but there are + // cases where we fail to find a valid insertion point and MLIR needs to know + // which operations we intend to rewrite before it calls the pattern rewriter + llvm::SmallDenseMap insertionPoints; + +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LatticePoint) + using AbstractDenseLattice::AbstractDenseLattice; + + bool operator==(const LatticePoint &rhs) const { + return stateMap == rhs.stateMap && insertionPoints == rhs.insertionPoints; + } + + /// Join the lattice accross control-flow edges + mlir::ChangeResult join(const AbstractDenseLattice &lattice) override; + + void print(llvm::raw_ostream &os) const override; + + /// Clear all modifications + mlir::ChangeResult reset(); + + /// Set the state of an SSA value + mlir::ChangeResult set(mlir::Value value, AllocationState state); + + /// Get fir.allocmem ops which were allocated in this function and always + /// freed before the function returns, plus whre to insert replacement + /// fir.alloca ops. The map is fir.allocmem -> (insert location) + void appendFreedValues( + llvm::DenseMap &out) const; + + std::optional get(mlir::Value val) const; +}; + +class AllocationAnalysis + : public mlir::dataflow::DenseDataFlowAnalysis { +public: + using DenseDataFlowAnalysis::DenseDataFlowAnalysis; + + void visitOperation(mlir::Operation *op, const LatticePoint &before, + LatticePoint *after) override; + + /// At an entry point, the last modifications of all memory resources are + /// yet to be determined + void setToEntryState(LatticePoint *lattice) override; + +protected: + /// Visit control flow operations and decide whether to call visitOperation + /// to apply the transfer function + void visitOperationCFG(mlir::Operation *op) override; +}; + +/// Drives analysis to find candidate fir.allocmem operations which could be +/// moved to the stack. Intended to be used with mlir::Pass::getAnalysis +class StackArraysAnalysisWrapper { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(StackArraysAnalysisWrapper) + + StackArraysAnalysisWrapper(mlir::Operation *op); + + bool hasErrors() const; + + const llvm::DenseMap & + getCandidateOps() const; + +private: + // Maps fir.allocmem -> operation after which to insert the fir.alloca + llvm::DenseMap candidateOps; + bool gotError = false; +}; + +/// Converts a fir.allocmem to a fir.alloca +class AllocMemConversion : public mlir::OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + AllocMemConversion( + mlir::MLIRContext *ctx, + const llvm::DenseMap &candidateOps); + + mlir::LogicalResult + matchAndRewrite(fir::AllocMemOp allocmem, + mlir::PatternRewriter &rewriter) const override; + + /// Determine where to insert the alloca operation. The returned value should + /// be checked to see if it is inside a loop + static mlir::Operation *findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc); + +private: + /// allocmem operations that DFA has determined are safe to move to the stack + /// mapping to where to insert replacement freemem operations + const llvm::DenseMap &candidateOps; + + /// Returns the alloca if it was successfully inserted, otherwise {} + std::optional + insertAlloca(fir::AllocMemOp &oldAlloc, + mlir::PatternRewriter &rewriter) const; +}; + +class StackArraysPass : public fir::impl::StackArraysBase { +public: + StackArraysPass() = default; + StackArraysPass(const StackArraysPass &pass); + + llvm::StringRef getDescription() const override; + + void runOnOperation() override; + +private: + Statistic runCount{this, "stackArraysRunCount", + "Number of heap allocations moved to the stack"}; +}; + +} // namespace + +static void print(llvm::raw_ostream &os, AllocationState state) { + switch (state) { + case AllocationState::Unknown: + os << "Unknown"; + break; + case AllocationState::Freed: + os << "Freed"; + break; + case AllocationState::Allocated: + os << "Allocated"; + break; + } +} + +/// Join two AllocationStates for the same value coming from different CFG +/// blocks +static AllocationState join(AllocationState lhs, AllocationState rhs) { + // | Allocated | Freed | Unknown + // ========= | ========= | ========= | ========= + // Allocated | Allocated | Unknown | Allocated + // Freed | Unknown | Freed | Unknown + // Unknown | Allocated | Unknown | Unknown + if (lhs == rhs) + return lhs; + if ((lhs == AllocationState::Unknown && rhs == AllocationState::Allocated) || + (lhs == AllocationState::Allocated && rhs == AllocationState::Unknown)) { + // On some path through this function we do an allocation. This is a + // candidate for a function-scope stack allocation so long as we are sure + // it gets freed by the time we return + return AllocationState::Allocated; + } + return AllocationState::Unknown; +} + +mlir::ChangeResult LatticePoint::join(const AbstractDenseLattice &lattice) { + const auto &rhs = static_cast(lattice); + mlir::ChangeResult changed = mlir::ChangeResult::NoChange; + + // add everything from rhs to map, handling cases where values are in both + for (const auto &[value, rhsState] : rhs.stateMap) { + auto it = stateMap.find(value); + if (it != stateMap.end()) { + // value is present in both maps + AllocationState myState = it->second; + AllocationState newState = ::join(myState, rhsState); + if (newState != myState) { + changed = mlir::ChangeResult::Change; + it->getSecond() = newState; + } + } else { + // value not present in current map: add it + stateMap.insert({value, rhsState}); + changed = mlir::ChangeResult::Change; + } + } + + return changed; +} + +void LatticePoint::print(llvm::raw_ostream &os) const { + for (const auto &[value, state] : stateMap) { + os << value << ": "; + ::print(os, state); + } +} + +mlir::ChangeResult LatticePoint::reset() { + if (stateMap.empty()) + return mlir::ChangeResult::NoChange; + stateMap.clear(); + return mlir::ChangeResult::Change; +} + +mlir::ChangeResult LatticePoint::set(mlir::Value value, AllocationState state) { + auto addInsertionPoint = [this](mlir::Value &value, AllocationState state) { + if (state != AllocationState::Freed) + return; + fir::AllocMemOp allocmem = value.getDefiningOp(); + mlir::Operation *insertionPoint = + AllocMemConversion::findAllocaInsertionPoint(allocmem); + if (insertionPoint) { + // check that we are not going to insert the fir.alloca into a loop + if (insertionPoint->getParentOfType()) + return; + auto [it, inserted] = insertionPoints.insert({value, insertionPoint}); + assert(inserted && "Uniqueness is already checked using stateMap"); + } + }; + + if (stateMap.count(value)) { + // already in map + AllocationState &oldState = stateMap[value]; + if (oldState != state) { + stateMap[value] = state; + addInsertionPoint(value, state); + return mlir::ChangeResult::Change; + } + return mlir::ChangeResult::NoChange; + } + stateMap.insert({value, state}); + addInsertionPoint(value, state); + return mlir::ChangeResult::Change; +} + +/// Get values which were allocated in this function and always freed before +/// the function returns +void LatticePoint::appendFreedValues( + llvm::DenseMap &out) const { + for (auto &[value, insertionPoint] : insertionPoints) { + out.insert({value.getDefiningOp(), insertionPoint}); + } +} + +std::optional LatticePoint::get(mlir::Value val) const { + auto it = stateMap.find(val); + if (it == stateMap.end()) + return {}; + return it->second; +} + +void AllocationAnalysis::visitOperation(mlir::Operation *op, + const LatticePoint &before, + LatticePoint *after) { + LLVM_DEBUG(llvm::dbgs() << "StackArrays: Visiting operation: " << *op + << "\n"); + LLVM_DEBUG(llvm::dbgs() << "--Lattice in: " << before << "\n"); + + // propagate before -> after + mlir::ChangeResult changed = after->join(before); + + if (auto allocmem = mlir::dyn_cast(op)) { + assert(op->getNumResults() == 1 && "fir.allocmem has one result"); + auto attr = op->getAttrOfType( + fir::MustBeHeapAttr::getAttrName()); + if (attr && attr.getValue()) { + LLVM_DEBUG(llvm::dbgs() << "--Found fir.must_be_heap: skipping\n"); + // skip allocation marked not to be moved + return; + } + + auto retTy = allocmem.getAllocatedType(); + if (!retTy.isa()) { + LLVM_DEBUG(llvm::dbgs() + << "--Allocation is not for an array: skipping\n"); + } + + mlir::Value result = op->getResult(0); + changed |= after->set(result, AllocationState::Allocated); + } else if (mlir::isa(op)) { + assert(op->getNumOperands() == 1 && "fir.freemem has one operand"); + mlir::Value operand = op->getOperand(0); + std::optional operandState = before.get(operand); + if (operandState && *operandState == AllocationState::Allocated) { + // don't tag things not allocated in this function as freed, so that we + // don't think they are candidates for moving to the stack + changed |= after->set(operand, AllocationState::Freed); + } + } + + // we pass lattices straight through fir.call because called functions should + // not deallocate flang-generated array temporaries + + LLVM_DEBUG(llvm::dbgs() << "--Lattice out: " << *after << "\n"); + propagateIfChanged(after, changed); +} + +void AllocationAnalysis::setToEntryState(LatticePoint *lattice) { + propagateIfChanged(lattice, lattice->reset()); +} + +/// Mostly a copy of AbstractDenseLattice::visitOperationCFG - the difference +/// being that call operations are passed through to the transfer function +void AllocationAnalysis::visitOperationCFG(mlir::Operation *op) { + // If the containing block is not executable, bail out. + if (!getOrCreateFor(op, op->getBlock())->isLive()) + return; + + // Get the dense lattice to update + mlir::dataflow::AbstractDenseLattice *after = getLattice(op); + + // If this op implements region control-flow, then control-flow dictates its + // transfer function. + if (auto branch = mlir::dyn_cast(op)) + return visitRegionBranchOperation(op, branch, after); + + // pass call operations through to the transfer function + + // Get the dense state before the execution of the op. + const mlir::dataflow::AbstractDenseLattice *before; + if (mlir::Operation *prev = op->getPrevNode()) + before = getLatticeFor(op, prev); + else + before = getLatticeFor(op, op->getBlock()); + + /// Invoke the operation transfer function + visitOperationImpl(op, *before, after); +} + +StackArraysAnalysisWrapper::StackArraysAnalysisWrapper(mlir::Operation *op) { + mlir::func::FuncOp func = mlir::cast(op); + + mlir::DataFlowSolver solver; + solver.load(); // required to mark blocks + // live + solver.load(); + if (failed(solver.initializeAndRun(op))) { + llvm::errs() << "DataFlowSolver failed!"; + gotError = true; + return; + } + + func.walk([&](fir::FreeMemOp child) { + const LatticePoint *lattice = solver.lookupState(child); + // there will be no lattice for an unreachable block + if (lattice) { + lattice->appendFreedValues(candidateOps); + } + }); + + LLVM_DEBUG(for (auto [allocMemOp, _] + : candidateOps) { + llvm::dbgs() << "StackArrays: Found candidate op: " << *allocMemOp << '\n'; + }); +} + +bool StackArraysAnalysisWrapper::hasErrors() const { return gotError; } + +const llvm::DenseMap & +StackArraysAnalysisWrapper::getCandidateOps() const { + return candidateOps; +} + +AllocMemConversion::AllocMemConversion( + mlir::MLIRContext *ctx, + const llvm::DenseMap &candidateOps) + : OpRewritePattern(ctx), candidateOps(candidateOps) {} + +mlir::LogicalResult +AllocMemConversion::matchAndRewrite(fir::AllocMemOp allocmem, + mlir::PatternRewriter &rewriter) const { + auto oldInsertionPt = rewriter.saveInsertionPoint(); + // add alloca operation + std::optional alloca = insertAlloca(allocmem, rewriter); + rewriter.restoreInsertionPoint(oldInsertionPt); + if (!alloca) + return mlir::failure(); + + // remove freemem operations + for (mlir::Operation *user : allocmem.getOperation()->getUsers()) { + if (mlir::isa(user)) { + rewriter.eraseOp(user); + } + } + + // replace references to heap allocation with references to stack allocation + rewriter.replaceAllUsesWith(allocmem.getResult(), alloca->getResult()); + + // remove allocmem operation + rewriter.eraseOp(allocmem.getOperation()); + + return mlir::success(); +} + +mlir::Operation * +AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) { + // Ideally the alloca should be inserted at the end of the function entry + // block so that we do not allocate stack space in a loop. However, + // the operands to the alloca may not be available that early, so insert it + // after the last operand becomes available + LLVM_DEBUG(llvm::dbgs() << "StackArrays: findAllocaInsertionPoint: " + << oldAlloc << "\n"); + + // Find when the last operand value becomes available + mlir::Block *operandsBlock = nullptr; + mlir::Operation *lastOperand = nullptr; + for (mlir::Value operand : oldAlloc.getOperands()) { + LLVM_DEBUG(llvm::dbgs() << "--considering operand " << operand << "\n"); + mlir::Operation *op = operand.getDefiningOp(); + if (!operandsBlock) + operandsBlock = op->getBlock(); + else if (operandsBlock != op->getBlock()) { + LLVM_DEBUG(llvm::dbgs() + << "----operand declared in a different block!\n"); + // Operation::isBeforeInBlock requires the operations to be in the same + // block. The best we can do is the location of the allocmem. + return oldAlloc.getOperation(); + } + if (!lastOperand || lastOperand->isBeforeInBlock(op)) + lastOperand = op; + } + + if (lastOperand) { + // there were value operands to the allocmem so insert after the last one + LLVM_DEBUG(llvm::dbgs() + << "--Placing after last operand: " << *lastOperand << "\n"); + return lastOperand; + } + + // There were no value operands to the allocmem so we are safe to insert it + // at the start of the function entry block + mlir::func::FuncOp func = oldAlloc->getParentOfType(); + assert(func && "This analysis is run on func.func"); + mlir::Block &entryBlock = func.getBlocks().front(); + LLVM_DEBUG(llvm::dbgs() << "--Placing at the start of func entry block\n"); + return &entryBlock.getOperations().front(); +} + +std::optional +AllocMemConversion::insertAlloca(fir::AllocMemOp &oldAlloc, + mlir::PatternRewriter &rewriter) const { + auto it = candidateOps.find(oldAlloc.getOperation()); + if (it == candidateOps.end()) { + return {}; + } + mlir::Operation *insertionPoint = it->second; + if (!insertionPoint) + return {}; + + mlir::Location loc = oldAlloc.getLoc(); + mlir::Type varTy = oldAlloc.getInType(); + auto unpackName = [](std::optional opt) -> llvm::StringRef { + if (opt) + return *opt; + return {}; + }; + llvm::StringRef uniqName = unpackName(oldAlloc.getUniqName()); + llvm::StringRef bindcName = unpackName(oldAlloc.getBindcName()); + + rewriter.setInsertionPointAfter(insertionPoint); + return rewriter.create(loc, varTy, uniqName, bindcName, + oldAlloc.getTypeparams(), + oldAlloc.getShape()); +} + +StackArraysPass::StackArraysPass(const StackArraysPass &pass) + : fir::impl::StackArraysBase(pass) {} + +llvm::StringRef StackArraysPass::getDescription() const { + return "Move heap allocated array temporaries to the stack"; +} + +void StackArraysPass::runOnOperation() { + mlir::Operation *op = getOperation(); + mlir::func::FuncOp func = mlir::cast(op); + + const auto &analysis = getAnalysis(); + if (analysis.hasErrors()) { + signalPassFailure(); + return; + } + + const auto &candidateOps = analysis.getCandidateOps(); + runCount += candidateOps.size(); + + mlir::MLIRContext &context = getContext(); + mlir::RewritePatternSet patterns(&context); + mlir::ConversionTarget target(context); + + target.addLegalDialect(); + target.addDynamicallyLegalOp([&](fir::AllocMemOp alloc) { + return !candidateOps.count(alloc.getOperation()); + }); + + patterns.insert(&context, candidateOps); + if (mlir::failed( + mlir::applyPartialConversion(func, target, std::move(patterns)))) { + mlir::emitError(func.getLoc(), "error in stack arrays optimization\n"); + signalPassFailure(); + } +} + +std::unique_ptr fir::createStackArraysPass() { + return std::make_unique(); +} Index: flang/test/Lower/Intrinsics/c_loc.f90 =================================================================== --- flang/test/Lower/Intrinsics/c_loc.f90 +++ flang/test/Lower/Intrinsics/c_loc.f90 @@ -177,7 +177,7 @@ ! CHECK: %[[VAL_2:.*]] = fir.zero_bits !fir.ptr ! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref> ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> {bindc_name = "ptr", uniq_name = "_QFc_loc_non_save_pointer_scalarEptr"} -! CHECK: %[[VAL_4:.*]] = fir.allocmem i32 {uniq_name = "_QFc_loc_non_save_pointer_scalarEi.alloc"} +! CHECK: %[[VAL_4:.*]] = fir.allocmem i32 {fir.must_be_heap = true, uniq_name = "_QFc_loc_non_save_pointer_scalarEi.alloc"} ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap) -> !fir.ptr ! CHECK: fir.store %[[VAL_5]] to %[[VAL_1]] : !fir.ref> ! CHECK: %[[VAL_6:.*]] = arith.constant 10 : i32 Index: flang/test/Lower/Intrinsics/system_clock.f90 =================================================================== --- flang/test/Lower/Intrinsics/system_clock.f90 +++ flang/test/Lower/Intrinsics/system_clock.f90 @@ -43,7 +43,7 @@ ! CHECK: %[[V_6:[0-9]+]] = fir.alloca i64 {bindc_name = "count_rate_", fir.target, uniq_name = "_QFssEcount_rate_"} ! CHECK: %[[V_7:[0-9]+]] = fir.convert %[[V_6]] : (!fir.ref) -> !fir.ptr ! CHECK: fir.store %[[V_7]] to %[[V_4]] : !fir.ref> - ! CHECK: %[[V_8:[0-9]+]] = fir.allocmem i64 {uniq_name = "_QFssEcount_max.alloc"} + ! CHECK: %[[V_8:[0-9]+]] = fir.allocmem i64 {fir.must_be_heap = true, uniq_name = "_QFssEcount_max.alloc"} ! CHECK: fir.store %[[V_8]] to %[[V_1]] : !fir.ref> ! CHECK: %[[V_9:[0-9]+]] = fir.load %[[V_4]] : !fir.ref> ! CHECK: %[[V_10:[0-9]+]] = fir.load %[[V_1]] : !fir.ref> Index: flang/test/Transforms/stack-arrays.f90 =================================================================== --- /dev/null +++ flang/test/Transforms/stack-arrays.f90 @@ -0,0 +1,122 @@ +! RUN: %flang_fc1 -emit-fir %s -o - | fir-opt --array-value-copy | fir-opt --stack-arrays | FileCheck %s + +! check simple array value copy case +subroutine array_value_copy_simple(arr) + integer, intent(inout) :: arr(4) + arr(3:4) = arr(1:2) +end subroutine +! CHECK-LABEL: func.func @_QParray_value_copy_simple(%arg0: !fir.ref> +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: fir.alloca !fir.array<4xi32> +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: return +! CHECK-NEXT: } + +! check complex array value copy case +module stuff + type DerivedWithAllocatable + integer, dimension(:), allocatable :: dat + end type + + contains + subroutine array_value_copy_complex(arr) + type(DerivedWithAllocatable), intent(inout) :: arr(:) + arr(3:4) = arr(1:2) + end subroutine +end module +! CHECK: func.func +! CHECK-SAME: array_value_copy_complex +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: fir.alloca !fir.array +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: return +! CHECK-NEXT: } + +subroutine test_vector_subscripted_section_to_box(v, x) + interface + subroutine takes_box(y) + real :: y(:) + end subroutine + end interface + + integer :: v(:) + real :: x(:) + call takes_box(x(v)) +end subroutine +! CHECK: func.func +! CHECK-SAME: test_vector_subscripted_section_to_box +! CHECK-NOT: fir.allocmem +! CHECK: fir.alloca !fir.array +! CHECK-NOT: fir.allocmem +! CHECK: fir.call @_QPtakes_box +! CHECK-NOT: fir.freemem +! CHECK: return +! CHECK-NEXT: } + +subroutine call_parenthesized_arg(x) + integer :: x(100) + call bar((x)) +end subroutine +! CHECK: func.func +! CHECK-SAME: call_parenthesized_arg +! CHECK-NOT: fir.allocmem +! CHECK: fir.alloca !fir.array<100xi32> +! CHECK-NOT: fir.allocmem +! CHECK: fir.call @_QPbar +! CHECK-NOT: fir.freemem +! CHECK: return +! CHECK-NEXT: } + +subroutine where_allocatable_assignments(a, b) + integer :: a(:) + integer, allocatable :: b(:) + where(b > 0) + b = a + elsewhere + b(:) = 0 + end where +end subroutine +! TODO: broken: passing allocation through fir.result +! CHECK: func.func +! CHECK-SAME: where_allocatable_assignments +! CHECK: return +! CHECK-NEXT: } + +subroutine array_constructor(a, b) + real :: a(5), b + real, external :: f + a = [f(b), f(b+1), f(b+2), f(b+5), f(b+11)] +end subroutine +! TODO: broken: realloc +! CHECK: func.func +! CHECK-SAME: array_constructor +! CHECK: return +! CHECK-NEXT: } + +subroutine sequence(seq, n) + integer :: n, seq(n) + seq = [(i,i=1,n)] +end subroutine +! TODO: broken: realloc +! CHECK: func.func +! CHECK-SAME: sequence +! CHECK: return +! CHECK-NEXT: } Index: flang/test/Transforms/stack-arrays.fir =================================================================== --- /dev/null +++ flang/test/Transforms/stack-arrays.fir @@ -0,0 +1,136 @@ +// RUN: fir-opt --stack-arrays %s | FileCheck %s + +// Simplest transformation +func.func @simple() { + %0 = fir.allocmem !fir.array<42xi32> + fir.freemem %0 : !fir.heap> + return +} +// CHECK: func.func @simple() { +// CHECK-NEXT: fir.alloca !fir.array<42xi32> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// Check fir.must_be_heap allocations are not moved +func.func @must_be_heap() { + %0 = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true} + fir.freemem %0 : !fir.heap> + return +} +// CHECK: func.func @must_be_heap() { +// CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true} +// CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// Check the data-flow-analysis can detect cases where we aren't sure if memory +// is freed by the end of the function +func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) { + %7 = arith.constant 42 : index + %8 = fir.allocmem !fir.array, %7 {uniq_name = "_QFdfa1Earr.alloc"} + %9 = fir.load %arg0 : !fir.ref> + %10 = fir.convert %9 : (!fir.logical<4>) -> i1 + fir.if %10 { + fir.freemem %8 : !fir.heap> + } else { + } + return +} +// CHECK: func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) { +// CHECK-NEXT: %c42 = arith.constant 42 : index +// CHECK-NEXT: %0 = fir.allocmem !fir.array, %c42 {uniq_name = "_QFdfa1Earr.alloc"} +// CHECK-NEXT: %1 = fir.load %arg0 : !fir.ref> +// CHECK-NEXT: %2 = fir.convert %1 : (!fir.logical<4>) -> i1 +// CHECK-NEXT: fir.if %2 { +// CHECK-NEXT: fir.freemem %0 : !fir.heap> +// CHECK-NEXT: } else { +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } + +// Check data-flow-analysis allows cases where it can't tell if something is +// allocated but it is definately freed +func.func @dfa2(%arg0: !fir.ref> {fir.bindc_name = "cond"}) { + %c42 = arith.constant 42 : index + %1 = fir.alloca !fir.heap> + %5 = fir.load %arg0 : !fir.ref> + %6 = fir.convert %5 : (!fir.logical<4>) -> i1 + fir.if %6 { + %12 = fir.allocmem !fir.array, %c42 + fir.store %12 to %1 : !fir.ref>> + } else { + } + %7 = fir.load %1 : !fir.ref>> + fir.freemem %7 : !fir.heap> + return +} +// TODO: the pass needs to track loads & stores for this case + +// check the alloca is placed after all operands become available +func.func @placement1() { + // do some stuff with other ssa values + %1 = arith.constant 1 : index + %2 = arith.constant 2 : index + %3 = arith.addi %1, %2 : index + // operand is now available + %4 = fir.allocmem !fir.array, %3 + // ... + fir.freemem %4 : !fir.heap> + return +} +// CHECK: func.func @placement1() { +// CHECK-NEXT: %[[ONE:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[TWO:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[ARG:.*]] = arith.addi %[[ONE]], %[[TWO]] : index +// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[ARG]] +// CHECK-NEXT: return +// CHECK-NEXT: } + +// check that if there are no operands, then the alloca is placed early +func.func @placement2() { + // do some stuff with other ssa values + %1 = arith.constant 1 : index + %2 = arith.constant 2 : index + %3 = arith.addi %1, %2 : index + %4 = fir.allocmem !fir.array<42xi32> + // ... + fir.freemem %4 : !fir.heap> + return +} +// CHECK: func.func @placement2() { +// CHECK-NEXT: %[[ONE:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> +// CHECK-NEXT: %[[TWO:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index +// CHECK-NEXT: return +// CHECK-NEXT: } + +// check that stack allocations are not placed in loops +func.func @placement3() { + %c1 = arith.constant 1 : index + %c1_i32 = fir.convert %c1 : (index) -> i32 + %c2 = arith.constant 2 : index + %c10 = arith.constant 10 : index + %0:2 = fir.do_loop %arg0 = %c1 to %c10 step %c1 iter_args(%arg1 = %c1_i32) -> (index, i32) { + %3 = arith.addi %c1, %c2 : index + // operand is now available + %4 = fir.allocmem !fir.array, %3 + // ... + fir.freemem %4 : !fir.heap> + fir.result %3, %c1_i32 : index, i32 + } + return +} +// CHECK: func.func @placement3() { +// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 +// CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index +// CHECK-NEXT: fir.do_loop +// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index +// CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array, %[[SUM]] +// CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap> +// CHECK-NEXT: fir.result +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } Index: mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h =================================================================== --- mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h +++ mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h @@ -93,16 +93,11 @@ propagateIfChanged(lhs, lhs->join(rhs)); } -private: /// Visit an operation. If this is a call operation or region control-flow /// operation, then the state after the execution of the operation is set by /// control-flow or the callgraph. Otherwise, this function invokes the /// operation transfer function. - void visitOperation(Operation *op); - - /// Visit a block. The state at the start of the block is propagated from - /// control-flow predecessors or callsites - void visitBlock(Block *block); + virtual void visitOperationCFG(Operation *op); /// Visit a program point within a region branch operation with predecessors /// in it. This can either be an entry block of one of the regions of the @@ -110,6 +105,11 @@ void visitRegionBranchOperation(ProgramPoint point, RegionBranchOpInterface branch, AbstractDenseLattice *after); + +private: + /// Visit a block. The state at the start of the block is propagated from + /// control-flow predecessors or callsites + void visitBlock(Block *block); }; //===----------------------------------------------------------------------===// @@ -148,7 +148,6 @@ setToEntryState(static_cast(lattice)); } -private: /// Type-erased wrappers that convert the abstract dense lattice to a derived /// lattice and invoke the virtual hooks operating on the derived lattice. void visitOperationImpl(Operation *op, const AbstractDenseLattice &before, Index: mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp =================================================================== --- mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp +++ mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp @@ -20,7 +20,7 @@ LogicalResult AbstractDenseDataFlowAnalysis::initialize(Operation *top) { // Visit every operation and block. - visitOperation(top); + visitOperationCFG(top); for (Region ®ion : top->getRegions()) { for (Block &block : region) { visitBlock(&block); @@ -34,7 +34,7 @@ LogicalResult AbstractDenseDataFlowAnalysis::visit(ProgramPoint point) { if (auto *op = point.dyn_cast()) - visitOperation(op); + visitOperationCFG(op); else if (auto *block = point.dyn_cast()) visitBlock(block); else @@ -42,7 +42,7 @@ return success(); } -void AbstractDenseDataFlowAnalysis::visitOperation(Operation *op) { +void AbstractDenseDataFlowAnalysis::visitOperationCFG(Operation *op) { // If the containing block is not executable, bail out. if (!getOrCreateFor(op, op->getBlock())->isLive()) return;