diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt --- a/clang/docs/tools/clang-formatted-files.txt +++ b/clang/docs/tools/clang-formatted-files.txt @@ -2299,6 +2299,7 @@ flang/lib/Optimizer/Transforms/MemRefDataFlowOpt.cpp flang/lib/Optimizer/Transforms/PassDetail.h flang/lib/Optimizer/Transforms/RewriteLoop.cpp +flang/lib/Optimizer/Transforms/StackArrays.cpp flang/lib/Parser/basic-parsers.h flang/lib/Parser/char-block.cpp flang/lib/Parser/char-buffer.cpp diff --git a/flang/include/flang/Optimizer/Builder/MutableBox.h b/flang/include/flang/Optimizer/Builder/MutableBox.h --- a/flang/include/flang/Optimizer/Builder/MutableBox.h +++ b/flang/include/flang/Optimizer/Builder/MutableBox.h @@ -127,8 +127,8 @@ void genInlinedAllocation(fir::FirOpBuilder &builder, mlir::Location loc, const fir::MutableBoxValue &box, mlir::ValueRange lbounds, mlir::ValueRange extents, - mlir::ValueRange lenParams, - llvm::StringRef allocName); + mlir::ValueRange lenParams, llvm::StringRef allocName, + bool mustBeHeap = false); void genInlinedDeallocate(fir::FirOpBuilder &builder, mlir::Location loc, const fir::MutableBoxValue &box); diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.h b/flang/include/flang/Optimizer/Dialect/FIRAttr.h --- a/flang/include/flang/Optimizer/Dialect/FIRAttr.h +++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.h @@ -57,6 +57,15 @@ mlir::Type getType() const; }; +/// Attribute which can be applied to a fir.allocmem operation, specifying that +/// the allocation must stay on the heap: passes may not move it to the stack +class MustBeHeapAttr : public mlir::BoolAttr { +public: + using BoolAttr::BoolAttr; + + static constexpr llvm::StringRef getAttrName() { return "fir.must_be_heap"; } +}; + // Attributes for building SELECT CASE multiway branches /// A closed interval (including the bound values) is an interval with both an diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h 
b/flang/include/flang/Optimizer/Transforms/Passes.h --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -55,6 +55,7 @@ std::unique_ptr createMemDataFlowOptPass(); std::unique_ptr createPromoteToAffinePass(); std::unique_ptr createMemoryAllocationPass(); +std::unique_ptr createStackArraysPass(); std::unique_ptr createSimplifyIntrinsicsPass(); std::unique_ptr createAddDebugFoundationPass(); diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -235,6 +235,16 @@ let constructor = "::fir::createMemoryAllocationPass()"; } +def StackArrays : Pass<"stack-arrays", "mlir::ModuleOp"> { + let summary = "Move local array allocations from heap memory into stack memory"; + let description = [{ + Convert heap allocations for arrays, even those of unknown size, into stack + allocations. 
+ }]; + let dependentDialects = [ "fir::FIROpsDialect" ]; + let constructor = "::fir::createStackArraysPass()"; +} + def SimplifyRegionLite : Pass<"simplify-region-lite", "mlir::ModuleOp"> { let summary = "Region simplification"; let description = [{ diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -424,7 +424,8 @@ } } fir::factory::genInlinedAllocation(builder, loc, box, lbounds, extents, - lenParams, mangleAlloc(alloc)); + lenParams, mangleAlloc(alloc), + /*mustBeHeap=*/true); } void genSimpleAllocation(const Allocation &alloc, diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp --- a/flang/lib/Optimizer/Builder/MutableBox.cpp +++ b/flang/lib/Optimizer/Builder/MutableBox.cpp @@ -16,6 +16,7 @@ #include "flang/Optimizer/Builder/Runtime/Derived.h" #include "flang/Optimizer/Builder/Runtime/Stop.h" #include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/Dialect/FIRAttr.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" #include "flang/Optimizer/Support/FatalError.h" @@ -712,13 +713,11 @@ return newStorage; } -void fir::factory::genInlinedAllocation(fir::FirOpBuilder &builder, - mlir::Location loc, - const fir::MutableBoxValue &box, - mlir::ValueRange lbounds, - mlir::ValueRange extents, - mlir::ValueRange lenParams, - llvm::StringRef allocName) { +void fir::factory::genInlinedAllocation( + fir::FirOpBuilder &builder, mlir::Location loc, + const fir::MutableBoxValue &box, mlir::ValueRange lbounds, + mlir::ValueRange extents, mlir::ValueRange lenParams, + llvm::StringRef allocName, bool mustBeHeap) { auto lengths = getNewLengths(builder, loc, box, lenParams); llvm::SmallVector safeExtents; for (mlir::Value extent : extents) @@ -735,6 +734,9 @@ mlir::Value irBox = fir::factory::getMutableIRBox(builder, loc, box); 
fir::runtime::genDerivedTypeInitialize(builder, loc, irBox); } + + heap->setAttr(fir::MustBeHeapAttr::getAttrName(), + fir::MustBeHeapAttr::get(builder.getContext(), mustBeHeap)); } void fir::factory::genInlinedDeallocate(fir::FirOpBuilder &builder, diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -8,6 +8,7 @@ ArrayValueCopy.cpp ExternalNameConversion.cpp MemoryAllocation.cpp + StackArrays.cpp MemRefDataFlowOpt.cpp SimplifyRegionLite.cpp AlgebraicSimplification.cpp diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp new file mode 100644 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp @@ -0,0 +1,736 @@ +//===- StackArrays.cpp ----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/LowLevelIntrinsics.h"
+#include "flang/Optimizer/Dialect/FIRAttr.h"
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Support/FIRContext.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h"
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlow/DenseAnalysis.h"
+#include "mlir/Analysis/DataFlowFramework.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
+#include <optional>
+
+namespace fir {
+#define GEN_PASS_DEF_STACKARRAYS
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "stack-arrays"
+
+namespace {
+
+/// The state of an SSA value at each program point
+enum class AllocationState {
+  /// This means that the allocation state of a variable cannot be determined
+  /// at this program point, e.g. because one route through a conditional freed
+  /// the variable and the other route didn't.
+  /// This asserts a known-unknown: different from the unknown-unknown of having
+  /// no AllocationState stored for a particular SSA value
+  Unknown,
+  /// Means this SSA value was allocated on the heap in this function and has
+  /// now been freed
+  Freed,
+  /// Means this SSA value was allocated on the heap in this function and is a
+  /// candidate for moving to the stack
+  Allocated,
+};
+
+/// Stores where an alloca should be inserted. If the PointerUnion is an
+/// Operation the alloca should be inserted /after/ the operation. If it is a
+/// block, the alloca can be placed anywhere in that block.
+/// A default "no insertion point" value is built from nullptr and converts to
+/// false.
+class InsertionPoint {
+  /// Either the operation to insert after, or the block to insert into
+  llvm::PointerUnion<mlir::Operation *, mlir::Block *> location;
+  /// Whether the alloca has to be bracketed by stacksave/stackrestore
+  bool saveRestoreStack;
+
+  /// Get contained pointer type or nullptr
+  template <class T>
+  T *tryGetPtr() const {
+    if (location.is<T *>())
+      return location.get<T *>();
+    return nullptr;
+  }
+
+public:
+  /// Build from an mlir::Operation* or an mlir::Block*
+  template <class T>
+  InsertionPoint(T *ptr, bool saveRestoreStack = false)
+      : location(ptr), saveRestoreStack{saveRestoreStack} {}
+  /// Build the empty insertion point
+  InsertionPoint(std::nullptr_t null)
+      : location(null), saveRestoreStack{false} {}
+
+  /// Get contained operation, or nullptr
+  mlir::Operation *tryGetOperation() const {
+    return tryGetPtr<mlir::Operation>();
+  }
+
+  /// Get contained block, or nullptr
+  mlir::Block *tryGetBlock() const { return tryGetPtr<mlir::Block>(); }
+
+  /// Get whether the stack should be saved/restored. If yes, an llvm.stacksave
+  /// intrinsic should be added before the alloca, and an llvm.stackrestore
+  /// intrinsic should be added where the freemem is
+  bool shouldSaveRestoreStack() const { return saveRestoreStack; }
+
+  /// True when a non-null operation or block is held
+  operator bool() const { return tryGetOperation() || tryGetBlock(); }
+
+  bool operator==(const InsertionPoint &rhs) const {
+    return (location == rhs.location) &&
+           (saveRestoreStack == rhs.saveRestoreStack);
+  }
+
+  bool operator!=(const InsertionPoint &rhs) const { return !(*this == rhs); }
+};
+
+/// Maps SSA values to their AllocationState at a particular program point.
+/// A value that is not being tracked has no entry in the map.
+class LatticePoint : public mlir::dataflow::AbstractDenseLattice {
+  // Maps all values we are interested in to states
+  llvm::SmallDenseMap<mlir::Value, AllocationState> stateMap;
+
+public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LatticePoint)
+  using AbstractDenseLattice::AbstractDenseLattice;
+
+  bool operator==(const LatticePoint &rhs) const {
+    return stateMap == rhs.stateMap;
+  }
+
+  /// Join the lattice across control-flow edges
+  mlir::ChangeResult join(const AbstractDenseLattice &lattice) override;
+
+  void print(llvm::raw_ostream &os) const override;
+
+  /// Clear all modifications
+  mlir::ChangeResult reset();
+
+  /// Set the state of an SSA value
+  mlir::ChangeResult set(mlir::Value value, AllocationState state);
+
+  /// Append to \p out every value whose state at this program point is
+  /// AllocationState::Freed
+  void appendFreedValues(llvm::DenseSet<mlir::Value> &out) const;
+
+  /// Get the state recorded for \p val, or an empty optional if the value is
+  /// not tracked at this program point
+  std::optional<AllocationState> get(mlir::Value val) const;
+};
+
+class AllocationAnalysis
+    : public mlir::dataflow::DenseDataFlowAnalysis<LatticePoint> {
+public:
+  using DenseDataFlowAnalysis::DenseDataFlowAnalysis;
+
+  void visitOperation(mlir::Operation *op, const LatticePoint &before,
+                      LatticePoint *after) override;
+
+  /// At an entry point no values are tracked yet: the lattice is reset to
+  /// empty
+  void setToEntryState(LatticePoint *lattice) override;
+
+protected:
+  /// Visit control flow operations and decide whether to call visitOperation
+  /// to apply the transfer function
+  void processOperation(mlir::Operation *op) override;
+};
+
+/// Drives analysis to find candidate fir.allocmem operations which could be
+/// moved to the stack. Intended to be used with mlir::Pass::getAnalysis
+class StackArraysAnalysisWrapper {
+public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(StackArraysAnalysisWrapper)
+
+  // Maps fir.allocmem -> place to insert alloca
+  using AllocMemMap = llvm::DenseMap<mlir::Operation *, InsertionPoint>;
+
+  StackArraysAnalysisWrapper(mlir::Operation *op) {}
+
+  /// True once the data flow solver has failed for any analysed function
+  bool hasErrors() const;
+
+  /// Get the candidates for \p func, running the analysis on first request
+  const AllocMemMap &getCandidateOps(mlir::Operation *func);
+
+private:
+  /// Per-function cache of analysis results
+  llvm::DenseMap<mlir::Operation *, AllocMemMap> funcMaps;
+  bool gotError = false;
+
+  void analyseFunction(mlir::Operation *func);
+};
+
+/// Converts a fir.allocmem to a fir.alloca
+class AllocMemConversion : public mlir::OpRewritePattern<fir::AllocMemOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  AllocMemConversion(
+      mlir::MLIRContext *ctx,
+      const llvm::DenseMap<mlir::Operation *, InsertionPoint> &candidateOps);
+
+  mlir::LogicalResult
+  matchAndRewrite(fir::AllocMemOp allocmem,
+                  mlir::PatternRewriter &rewriter) const override;
+
+  /// Determine where to insert the alloca operation. The returned value should
+  /// be checked to see if it is inside a loop
+  static InsertionPoint findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc);
+
+private:
+  /// allocmem operations that DFA has determined are safe to move to the stack
+  /// mapping to where to insert replacement alloca operations
+  const llvm::DenseMap<mlir::Operation *, InsertionPoint> &candidateOps;
+
+  /// If we failed to find an insertion point not inside a loop, see if it would
+  /// be safe to use an llvm.stacksave/llvm.stackrestore inside the loop
+  static InsertionPoint findAllocaLoopInsertionPoint(fir::AllocMemOp &oldAlloc);
+
+  /// Returns the alloca if it was successfully inserted, otherwise {}
+  std::optional<fir::AllocaOp>
+  insertAlloca(fir::AllocMemOp &oldAlloc,
+               mlir::PatternRewriter &rewriter) const;
+
+  /// Inserts a stacksave before oldAlloc and a stackrestore before each
+  /// freemem
+  void insertStackSaveRestore(fir::AllocMemOp &oldAlloc,
+                              mlir::PatternRewriter &rewriter) const;
+};
+
+class StackArraysPass : public fir::impl::StackArraysBase<StackArraysPass> {
+public:
+  StackArraysPass() = default;
+  StackArraysPass(const StackArraysPass &pass);
+
+  llvm::StringRef getDescription() const override;
+
+  void runOnOperation() override;
+  void runOnFunc(mlir::Operation *func);
+
+private:
+  Statistic runCount{this, "stackArraysRunCount",
+                     "Number of heap allocations moved to the stack"};
+};
+
+} // namespace
+
+/// Write the name of \p state to \p os
+static void print(llvm::raw_ostream &os, AllocationState state) {
+  switch (state) {
+  case AllocationState::Unknown:
+    os << "Unknown";
+    break;
+  case AllocationState::Freed:
+    os << "Freed";
+    break;
+  case AllocationState::Allocated:
+    os << "Allocated";
+    break;
+  }
+}
+
+/// Join two AllocationStates for the same value coming from different CFG
+/// blocks
+static AllocationState join(AllocationState lhs, AllocationState rhs) {
+  //           | Allocated | Freed     | Unknown
+  // ========= | ========= | ========= | =========
+  // Allocated | Allocated | Unknown   | Unknown
+  // Freed     | Unknown   | Freed     | Unknown
+  // Unknown   | Unknown   | Unknown   | Unknown
+  if (lhs == rhs)
+    return lhs;
+  return AllocationState::Unknown;
+}
+
+mlir::ChangeResult LatticePoint::join(const AbstractDenseLattice &lattice) {
+  const auto &rhs = static_cast<const LatticePoint &>(lattice);
+  mlir::ChangeResult changed = mlir::ChangeResult::NoChange;
+
+  // add everything from rhs to map, handling cases where values are in both
+  for (const auto &[value, rhsState] : rhs.stateMap) {
+    auto it = stateMap.find(value);
+    if (it != stateMap.end()) {
+      // value is present in both maps
+      AllocationState myState = it->second;
+      AllocationState newState = ::join(myState, rhsState);
+      if (newState != myState) {
+        changed = mlir::ChangeResult::Change;
+        it->getSecond() = newState;
+      }
+    } else {
+      // value not present in current map: add it
+      stateMap.insert({value, rhsState});
+      changed = mlir::ChangeResult::Change;
+    }
+  }
+
+  return changed;
+}
+
+void LatticePoint::print(llvm::raw_ostream &os) const {
+  for (const auto &[value, state] : stateMap) {
+    os << value << ": ";
+    ::print(os, state);
+  }
+}
+
+mlir::ChangeResult
LatticePoint::reset() {
+  if (stateMap.empty())
+    return mlir::ChangeResult::NoChange;
+  stateMap.clear();
+  return mlir::ChangeResult::Change;
+}
+
+mlir::ChangeResult LatticePoint::set(mlir::Value value, AllocationState state) {
+  // a single lookup both finds an existing entry and inserts a missing one
+  auto [it, inserted] = stateMap.try_emplace(value, state);
+  if (inserted)
+    return mlir::ChangeResult::Change;
+  if (it->second == state)
+    return mlir::ChangeResult::NoChange;
+  it->second = state;
+  return mlir::ChangeResult::Change;
+}
+
+/// Append every value whose state at this program point is Freed
+void LatticePoint::appendFreedValues(llvm::DenseSet<mlir::Value> &out) const {
+  for (auto &[value, state] : stateMap) {
+    if (state == AllocationState::Freed)
+      out.insert(value);
+  }
+}
+
+std::optional<AllocationState> LatticePoint::get(mlir::Value val) const {
+  auto it = stateMap.find(val);
+  if (it == stateMap.end())
+    return {};
+  return it->second;
+}
+
+/// Transfer function: carries the incoming lattice over to \p after, then
+/// marks array fir.allocmem results as Allocated, marks operands of
+/// fir.freemem as Freed when they were Allocated, and joins the state at a
+/// fir.result into the lattice of the parent operation.
+void AllocationAnalysis::visitOperation(mlir::Operation *op,
+                                        const LatticePoint &before,
+                                        LatticePoint *after) {
+  LLVM_DEBUG(llvm::dbgs() << "StackArrays: Visiting operation: " << *op
+                          << "\n");
+  LLVM_DEBUG(llvm::dbgs() << "--Lattice in: " << before << "\n");
+
+  // propagate before -> after
+  mlir::ChangeResult changed = after->join(before);
+
+  if (auto allocmem = mlir::dyn_cast<fir::AllocMemOp>(op)) {
+    assert(op->getNumResults() == 1 && "fir.allocmem has one result");
+    auto attr = op->getAttrOfType<fir::MustBeHeapAttr>(
+        fir::MustBeHeapAttr::getAttrName());
+    // Allocations we do not track must still fall through to the
+    // propagateIfChanged call below: returning here would drop the update
+    // made by the join above and leave dependent lattices stale.
+    if (attr && attr.getValue()) {
+      // skip allocation marked not to be moved
+      LLVM_DEBUG(llvm::dbgs() << "--Found fir.must_be_heap: skipping\n");
+    } else if (!allocmem.getAllocatedType().isa<fir::SequenceType>()) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "--Allocation is not for an array: skipping\n");
+    } else {
+      mlir::Value result = op->getResult(0);
+      changed |= after->set(result, AllocationState::Allocated);
+    }
+  } else if (mlir::isa<fir::FreeMemOp>(op)) {
+    assert(op->getNumOperands() == 1 && "fir.freemem has one operand");
+    mlir::Value operand = op->getOperand(0);
+    std::optional<AllocationState> operandState = before.get(operand);
+    if (operandState && *operandState == AllocationState::Allocated) {
+      // don't tag things not allocated in this function as freed, so that we
+      // don't think they are candidates for moving to the stack
+      changed |= after->set(operand, AllocationState::Freed);
+    }
+  } else if (mlir::isa<fir::ResultOp>(op)) {
+    mlir::Operation *parent = op->getParentOp();
+    LatticePoint *parentLattice = getLattice(parent);
+    assert(parentLattice);
+    mlir::ChangeResult parentChanged = parentLattice->join(*after);
+    propagateIfChanged(parentLattice, parentChanged);
+  }
+
+  // we pass lattices straight through fir.call because called functions should
+  // not deallocate flang-generated array temporaries
+
+  LLVM_DEBUG(llvm::dbgs() << "--Lattice out: " << *after << "\n");
+  propagateIfChanged(after, changed);
+}
+
+void AllocationAnalysis::setToEntryState(LatticePoint *lattice) {
+  propagateIfChanged(lattice, lattice->reset());
+}
+
+/// Mostly a copy of AbstractDenseDataFlowAnalysis::processOperation - the
+/// difference being that call operations are passed through to the transfer
+/// function
+void AllocationAnalysis::processOperation(mlir::Operation *op) {
+  // If the containing block is not executable, bail out.
+  if (!getOrCreateFor<mlir::dataflow::Executable>(op, op->getBlock())->isLive())
+    return;
+
+  // Get the dense lattice to update
+  mlir::dataflow::AbstractDenseLattice *after = getLattice(op);
+
+  // If this op implements region control-flow, then control-flow dictates its
+  // transfer function.
+  if (auto branch = mlir::dyn_cast<mlir::RegionBranchOpInterface>(op))
+    return visitRegionBranchOperation(op, branch, after);
+
+  // pass call operations through to the transfer function
+
+  // Get the dense state before the execution of the op.
+  const mlir::dataflow::AbstractDenseLattice *before;
+  if (mlir::Operation *prev = op->getPrevNode())
+    before = getLatticeFor(op, prev);
+  else
+    before = getLatticeFor(op, op->getBlock());
+
+  /// Invoke the operation transfer function
+  visitOperationImpl(op, *before, after);
+}
+
+/// Run the data flow analysis on \p func and record, for each array
+/// allocation that is freed before every reachable return, where the
+/// replacement alloca should be inserted.
+void StackArraysAnalysisWrapper::analyseFunction(mlir::Operation *func) {
+  assert(mlir::isa<mlir::func::FuncOp>(func));
+  mlir::DataFlowSolver solver;
+  // constant propagation is required for dead code analysis, dead code analysis
+  // is required to mark blocks live (required for mlir dense dfa)
+  solver.load<mlir::dataflow::SparseConstantPropagation>();
+  solver.load<mlir::dataflow::DeadCodeAnalysis>();
+
+  auto [it, inserted] = funcMaps.try_emplace(func);
+  AllocMemMap &candidateOps = it->second;
+
+  solver.load<AllocationAnalysis>();
+  if (failed(solver.initializeAndRun(func))) {
+    llvm::errs() << "DataFlowSolver failed!";
+    gotError = true;
+    return;
+  }
+
+  // Collect the lattice at every reachable return
+  llvm::SmallVector<const LatticePoint *> returnLattices;
+  func->walk([&](mlir::func::ReturnOp child) {
+    const LatticePoint *lattice = solver.lookupState<LatticePoint>(child);
+    // there will be no lattice for an unreachable block
+    if (lattice)
+      returnLattices.push_back(lattice);
+  });
+
+  llvm::DenseSet<mlir::Value> freedValues;
+  for (const LatticePoint *lattice : returnLattices)
+    lattice->appendFreedValues(freedValues);
+
+  // A value freed before one return may still be allocated (or in an unknown
+  // state) at another one. Only accept values which no reachable return sees
+  // in a state other than Freed; a return with no entry for the value never
+  // saw the allocation.
+  auto freedOnEveryReturn = [&](mlir::Value value) {
+    for (const LatticePoint *lattice : returnLattices) {
+      std::optional<AllocationState> state = lattice->get(value);
+      if (state && *state != AllocationState::Freed)
+        return false;
+    }
+    return true;
+  };
+
+  for (mlir::Value freedValue : freedValues) {
+    if (!freedOnEveryReturn(freedValue))
+      continue;
+    fir::AllocMemOp allocmem = freedValue.getDefiningOp<fir::AllocMemOp>();
+    InsertionPoint insertionPoint =
+        AllocMemConversion::findAllocaInsertionPoint(allocmem);
+    if (insertionPoint)
+      candidateOps.insert({allocmem, insertionPoint});
+  }
+
+  LLVM_DEBUG(for (auto [allocMemOp, _]
+                  : candidateOps) {
+    llvm::dbgs() << "StackArrays: Found candidate op: " << *allocMemOp << '\n';
+  });
+}
+
+bool StackArraysAnalysisWrapper::hasErrors() const { return gotError; }
+
+const StackArraysAnalysisWrapper::AllocMemMap &
+StackArraysAnalysisWrapper::getCandidateOps(mlir::Operation *func) {
+  if (!funcMaps.count(func))
+    analyseFunction(func);
+  return funcMaps[func];
+}
+
+AllocMemConversion::AllocMemConversion(
+    mlir::MLIRContext *ctx,
+    const llvm::DenseMap<mlir::Operation *, InsertionPoint> &candidateOps)
+    : OpRewritePattern(ctx), candidateOps(candidateOps) {}
+
+/// Replace \p allocmem with a fir.alloca at the recorded insertion point and
+/// erase the fir.freemem operations that used it. Fails when no alloca could
+/// be inserted.
+mlir::LogicalResult
+AllocMemConversion::matchAndRewrite(fir::AllocMemOp allocmem,
+                                    mlir::PatternRewriter &rewriter) const {
+  auto oldInsertionPt = rewriter.saveInsertionPoint();
+  // add alloca operation
+  std::optional<fir::AllocaOp> alloca = insertAlloca(allocmem, rewriter);
+  rewriter.restoreInsertionPoint(oldInsertionPt);
+  if (!alloca)
+    return mlir::failure();
+
+  // remove freemem operations
+  // Collect them first: erasing a user while walking the use list is only
+  // safe if the rewriter happens to defer the erasure.
+  llvm::SmallVector<mlir::Operation *> freeOps;
+  for (mlir::Operation *user : allocmem.getOperation()->getUsers())
+    if (mlir::isa<fir::FreeMemOp>(user))
+      freeOps.push_back(user);
+  for (mlir::Operation *freeOp : freeOps)
+    rewriter.eraseOp(freeOp);
+
+  // replace references to heap allocation with references to stack allocation
+  rewriter.replaceAllUsesWith(allocmem.getResult(), alloca->getResult());
+
+  // remove allocmem operation
+  rewriter.eraseOp(allocmem.getOperation());
+
+  return mlir::success();
+}
+
+static bool isInLoop(mlir::Block *block) { return mlir::blockIsInLoop(block); }
+
+static bool isInLoop(mlir::Operation *op) {
+  return isInLoop(op->getBlock()) ||
+         op->getParentOfType<mlir::LoopLikeOpInterface>();
+}
+
+InsertionPoint
+AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) {
+  // Ideally the alloca should be inserted at the end of the function entry
+  // block so that we do not allocate stack space in a loop. However,
+  // the operands to the alloca may not be available that early, so insert it
+  // after the last operand becomes available
+  // If the old allocmem op was in an openmp region then it should not be moved
+  // outside of that
+  LLVM_DEBUG(llvm::dbgs() << "StackArrays: findAllocaInsertionPoint: "
+                          << oldAlloc << "\n");
+
+  // check that an Operation or Block we are about to return is not in a loop
+  auto checkReturn = [&](auto *point) -> InsertionPoint {
+    if (isInLoop(point)) {
+      mlir::Operation *oldAllocOp = oldAlloc.getOperation();
+      if (isInLoop(oldAllocOp)) {
+        // where we want to put it is in a loop, and even the old location is in
+        // a loop. The only remaining option is to keep the old location and
+        // save/restore the stack around it, if that is safe.
+        return findAllocaLoopInsertionPoint(oldAlloc);
+      }
+      return {oldAllocOp};
+    }
+    return {point};
+  };
+
+  auto oldOmpRegion =
+      oldAlloc->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
+
+  // Find when the last operand value becomes available
+  mlir::Block *operandsBlock = nullptr;
+  mlir::Operation *lastOperand = nullptr;
+  for (mlir::Value operand : oldAlloc.getOperands()) {
+    LLVM_DEBUG(llvm::dbgs() << "--considering operand " << operand << "\n");
+    mlir::Operation *op = operand.getDefiningOp();
+    if (!op) {
+      // The operand is a block argument, so there is no operation to insert
+      // after. The best we can do is the location of the allocmem.
+      LLVM_DEBUG(llvm::dbgs() << "----operand is a block argument!\n");
+      return checkReturn(oldAlloc.getOperation());
+    }
+    if (!operandsBlock)
+      operandsBlock = op->getBlock();
+    else if (operandsBlock != op->getBlock()) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "----operand declared in a different block!\n");
+      // Operation::isBeforeInBlock requires the operations to be in the same
+      // block. The best we can do is the location of the allocmem.
+      return checkReturn(oldAlloc.getOperation());
+    }
+    if (!lastOperand || lastOperand->isBeforeInBlock(op))
+      lastOperand = op;
+  }
+
+  if (lastOperand) {
+    // there were value operands to the allocmem so insert after the last one
+    LLVM_DEBUG(llvm::dbgs()
+               << "--Placing after last operand: " << *lastOperand << "\n");
+    // check we aren't moving out of an omp region
+    auto lastOpOmpRegion =
+        lastOperand->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
+    if (lastOpOmpRegion == oldOmpRegion)
+      return checkReturn(lastOperand);
+    // Presumably this happened because the operands became ready before the
+    // start of this openmp region. (lastOpOmpRegion != oldOmpRegion) should
+    // imply that oldOmpRegion comes after lastOpOmpRegion.
+    return checkReturn(oldOmpRegion.getAllocaBlock());
+  }
+
+  // There were no value operands to the allocmem so we are safe to insert it
+  // as early as we want
+
+  // handle openmp case
+  if (oldOmpRegion) {
+    return checkReturn(oldOmpRegion.getAllocaBlock());
+  }
+
+  // fall back to the function entry block
+  mlir::func::FuncOp func = oldAlloc->getParentOfType<mlir::func::FuncOp>();
+  assert(func && "This analysis is run on func.func");
+  mlir::Block &entryBlock = func.getBlocks().front();
+  LLVM_DEBUG(llvm::dbgs() << "--Placing at the start of func entry block\n");
+  return checkReturn(&entryBlock);
+}
+
+InsertionPoint
+AllocMemConversion::findAllocaLoopInsertionPoint(fir::AllocMemOp &oldAlloc) {
+  mlir::Operation *oldAllocOp = oldAlloc;
+  // This is only called as a last resort. We should try to insert at the
+  // location of the old allocation, which is inside of a loop, using
+  // llvm.stacksave/llvm.stackrestore
+
+  // find freemem ops
+  llvm::SmallVector<mlir::Operation *> freeOps;
+  for (mlir::Operation *user : oldAllocOp->getUsers()) {
+    if (mlir::isa<fir::FreeMemOp>(user)) {
+      freeOps.push_back(user);
+    }
+  }
+  assert(freeOps.size() && "DFA should only return freed memory");
+
+  // Don't attempt to reason about a stacksave/stackrestore between different
+  // blocks
+  for (mlir::Operation *free : freeOps)
+    if (free->getBlock() != oldAllocOp->getBlock())
+      return {nullptr};
+
+  // Check that there aren't any other stack allocations in between the
+  // stack save and stack restore
+  // note: for flang generated temporaries there should only be one free op
+  for (mlir::Operation *free : freeOps) {
+    for (mlir::Operation *op = oldAlloc; op && op != free;
+         op = op->getNextNode()) {
+      if (mlir::isa<fir::AllocaOp>(op))
+        return {nullptr};
+    }
+  }
+
+  return InsertionPoint{oldAllocOp, /*saveRestoreStack=*/true};
+}
+
+std::optional<fir::AllocaOp>
+AllocMemConversion::insertAlloca(fir::AllocMemOp &oldAlloc,
+                                 mlir::PatternRewriter &rewriter) const {
+  auto it = candidateOps.find(oldAlloc.getOperation());
+  if (it == candidateOps.end()) {
+    return {};
+  }
+  InsertionPoint insertionPoint = it->second;
+  if (!insertionPoint)
+    return {};
+
+  if (insertionPoint.shouldSaveRestoreStack())
+    insertStackSaveRestore(oldAlloc, rewriter);
+
+  mlir::Location loc = oldAlloc.getLoc();
+  mlir::Type varTy = oldAlloc.getInType();
+  // an absent name becomes an empty StringRef
+  auto unpackName = [](std::optional<llvm::StringRef> opt) -> llvm::StringRef {
+    if (opt)
+      return *opt;
+    return {};
+  };
+  llvm::StringRef uniqName = unpackName(oldAlloc.getUniqName());
+  llvm::StringRef bindcName = unpackName(oldAlloc.getBindcName());
+
+  if (mlir::Operation *op = insertionPoint.tryGetOperation())
+    rewriter.setInsertionPointAfter(op);
+  else {
+    mlir::Block *block = insertionPoint.tryGetBlock();
+    assert(block && "There must be a valid insertion point");
+    rewriter.setInsertionPointToStart(block);
+  }
+
+  return rewriter.create<fir::AllocaOp>(loc, varTy, uniqName, bindcName,
+                                        oldAlloc.getTypeparams(),
+                                        oldAlloc.getShape());
+}
+
+/// Insert a call to the stack save function immediately before \p oldAlloc and
+/// a call to the stack restore function immediately before each fir.freemem
+/// user of \p oldAlloc. The rewriter's insertion point is preserved.
+void AllocMemConversion::insertStackSaveRestore(
+    fir::AllocMemOp &oldAlloc, mlir::PatternRewriter &rewriter) const {
+  auto oldPoint = rewriter.saveInsertionPoint();
+  auto mod = oldAlloc->getParentOfType<mlir::ModuleOp>();
+  fir::KindMapping kindMap = fir::getKindMapping(mod);
+  fir::FirOpBuilder builder{rewriter, kindMap};
+
+  mlir::func::FuncOp stackSaveFn = fir::factory::getLlvmStackSave(builder);
+  mlir::SymbolRefAttr stackSaveSym =
+      builder.getSymbolRefAttr(stackSaveFn.getName());
+
+  builder.setInsertionPoint(oldAlloc);
+  mlir::Value sp =
+      builder
+          .create<fir::CallOp>(oldAlloc.getLoc(),
+                               stackSaveFn.getFunctionType().getResults(),
+                               stackSaveSym, mlir::ValueRange{})
+          .getResult(0);
+
+  mlir::func::FuncOp stackRestoreFn =
+      fir::factory::getLlvmStackRestore(builder);
+  mlir::SymbolRefAttr stackRestoreSym =
+      builder.getSymbolRefAttr(stackRestoreFn.getName());
+
+  for (mlir::Operation *user : oldAlloc->getUsers()) {
+    if (mlir::isa<fir::FreeMemOp>(user)) {
+      builder.setInsertionPoint(user);
+      builder.create<fir::CallOp>(user->getLoc(),
+                                  stackRestoreFn.getFunctionType().getResults(),
+                                  stackRestoreSym, mlir::ValueRange{sp});
+    }
+  }
+
+  rewriter.restoreInsertionPoint(oldPoint);
+}
+
+StackArraysPass::StackArraysPass(const StackArraysPass &pass)
+    : fir::impl::StackArraysBase<StackArraysPass>(pass) {}
+
+llvm::StringRef StackArraysPass::getDescription() const {
+  return "Move heap allocated array temporaries to the stack";
+}
+
+void StackArraysPass::runOnOperation() {
+  mlir::ModuleOp mod = getOperation();
+
+  mod.walk([this](mlir::func::FuncOp func) { runOnFunc(func); });
+}
+
+/// Convert every candidate fir.allocmem found by the analysis in \p func.
+/// Signals pass failure if the analysis or the conversion fails.
+void StackArraysPass::runOnFunc(mlir::Operation *func) {
+  assert(mlir::isa<mlir::func::FuncOp>(func));
+
+  auto &analysis = getAnalysis<StackArraysAnalysisWrapper>();
+  const auto &candidateOps = analysis.getCandidateOps(func);
+  if (analysis.hasErrors()) {
+    signalPassFailure();
+    return;
+  }
+
+  if (candidateOps.empty())
+    return;
+  runCount += candidateOps.size();
+
+  mlir::MLIRContext &context = getContext();
+  mlir::RewritePatternSet patterns(&context);
+  mlir::ConversionTarget target(context);
+
+  // NOTE(review): the dialect list was lost when this patch was extracted;
+  // confirm it against the tree before applying.
+  target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect,
+                         mlir::func::FuncDialect>();
+  // only the allocations selected by the analysis need converting
+  target.addDynamicallyLegalOp<fir::AllocMemOp>([&](fir::AllocMemOp alloc) {
+    return !candidateOps.count(alloc.getOperation());
+  });
+
+  patterns.insert<AllocMemConversion>(&context, candidateOps);
+  if (mlir::failed(
+          mlir::applyPartialConversion(func, target, std::move(patterns)))) {
+    mlir::emitError(func->getLoc(), "error in stack arrays optimization\n");
+    signalPassFailure();
+  }
+}
+
+std::unique_ptr<mlir::Pass> fir::createStackArraysPass() {
+  return std::make_unique<StackArraysPass>();
+}
diff --git a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90 b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
--- a/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
+++ b/flang/test/Lower/HLFIR/allocatable-and-pointer-status-change.f90
@@ -20,7 +20,7 @@
 ! CHECK: %[[VAL_12:.*]] = arith.constant 0 : index
 ! CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_12]] : index
 ! CHECK: %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_11]], %[[VAL_12]] : index
-!
CHECK: %[[VAL_15:.*]] = fir.allocmem !fir.array>(%[[VAL_2]] : index), %[[VAL_14]] {uniq_name = "_QFallocationEx.alloc"} +! CHECK: %[[VAL_15:.*]] = fir.allocmem !fir.array>(%[[VAL_2]] : index), %[[VAL_14]] {fir.must_be_heap = true, uniq_name = "_QFallocationEx.alloc"} ! CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_17:.*]] = fir.embox %[[VAL_15]](%[[VAL_16]]) typeparams %[[VAL_2]] : (!fir.heap>>, !fir.shape<1>, index) -> !fir.box>>> ! CHECK: fir.store %[[VAL_17]] to %[[VAL_3]]#1 : !fir.ref>>>> @@ -84,7 +84,7 @@ ! CHECK: %[[VAL_9:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_9]] : index ! CHECK: %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_8]], %[[VAL_9]] : index -! CHECK: %[[VAL_12:.*]] = fir.allocmem !fir.array, %[[VAL_11]] {uniq_name = "_QEa.alloc"} +! CHECK: %[[VAL_12:.*]] = fir.allocmem !fir.array, %[[VAL_11]] {fir.must_be_heap = true, uniq_name = "_QEa.alloc"} ! CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_14:.*]] = fir.embox %[[VAL_12]](%[[VAL_13]]) : (!fir.heap>, !fir.shape<1>) -> !fir.box>> ! CHECK: fir.store %[[VAL_14]] to %[[VAL_6]] : !fir.ref>>> diff --git a/flang/test/Lower/Intrinsics/c_loc.f90 b/flang/test/Lower/Intrinsics/c_loc.f90 --- a/flang/test/Lower/Intrinsics/c_loc.f90 +++ b/flang/test/Lower/Intrinsics/c_loc.f90 @@ -177,7 +177,7 @@ ! CHECK: %[[VAL_2:.*]] = fir.zero_bits !fir.ptr ! CHECK: fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref> ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> {bindc_name = "ptr", uniq_name = "_QFc_loc_non_save_pointer_scalarEptr"} -! CHECK: %[[VAL_4:.*]] = fir.allocmem i32 {uniq_name = "_QFc_loc_non_save_pointer_scalarEi.alloc"} +! CHECK: %[[VAL_4:.*]] = fir.allocmem i32 {fir.must_be_heap = true, uniq_name = "_QFc_loc_non_save_pointer_scalarEi.alloc"} ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap) -> !fir.ptr ! 
CHECK: fir.store %[[VAL_5]] to %[[VAL_1]] : !fir.ref> ! CHECK: %[[VAL_6:.*]] = arith.constant 10 : i32 diff --git a/flang/test/Lower/Intrinsics/system_clock.f90 b/flang/test/Lower/Intrinsics/system_clock.f90 --- a/flang/test/Lower/Intrinsics/system_clock.f90 +++ b/flang/test/Lower/Intrinsics/system_clock.f90 @@ -43,7 +43,7 @@ ! CHECK: %[[V_6:[0-9]+]] = fir.alloca i64 {bindc_name = "count_rate_", fir.target, uniq_name = "_QFssEcount_rate_"} ! CHECK: %[[V_7:[0-9]+]] = fir.convert %[[V_6]] : (!fir.ref) -> !fir.ptr ! CHECK: fir.store %[[V_7]] to %[[V_4]] : !fir.ref> - ! CHECK: %[[V_8:[0-9]+]] = fir.allocmem i64 {uniq_name = "_QFssEcount_max.alloc"} + ! CHECK: %[[V_8:[0-9]+]] = fir.allocmem i64 {fir.must_be_heap = true, uniq_name = "_QFssEcount_max.alloc"} ! CHECK: fir.store %[[V_8]] to %[[V_1]] : !fir.ref> ! CHECK: %[[V_9:[0-9]+]] = fir.load %[[V_4]] : !fir.ref> ! CHECK: %[[V_10:[0-9]+]] = fir.load %[[V_1]] : !fir.ref> diff --git a/flang/test/Transforms/stack-arrays.f90 b/flang/test/Transforms/stack-arrays.f90 new file mode 100644 --- /dev/null +++ b/flang/test/Transforms/stack-arrays.f90 @@ -0,0 +1,140 @@ +! RUN: %flang_fc1 -emit-fir %s -o - | fir-opt --array-value-copy | fir-opt --stack-arrays | FileCheck %s + +! check simple array value copy case +subroutine array_value_copy_simple(arr) + integer, intent(inout) :: arr(4) + arr(3:4) = arr(1:2) +end subroutine +! CHECK-LABEL: func.func @_QParray_value_copy_simple(%arg0: !fir.ref> +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: fir.alloca !fir.array<4xi32> +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: return +! CHECK-NEXT: } + +! check complex array value copy case +module stuff + type DerivedWithAllocatable + integer, dimension(:), allocatable :: dat + end type + + contains + subroutine array_value_copy_complex(arr) + type(DerivedWithAllocatable), intent(inout) :: arr(:) + arr(3:4) = arr(1:2) + end subroutine +end module +! CHECK: func.func +! CHECK-SAME: array_value_copy_complex +! 
CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: fir.alloca !fir.array +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: return +! CHECK-NEXT: } + +subroutine test_vector_subscripted_section_to_box(v, x) + interface + subroutine takes_box(y) + real :: y(:) + end subroutine + end interface + + integer :: v(:) + real :: x(:) + call takes_box(x(v)) +end subroutine +! CHECK: func.func +! CHECK-SAME: test_vector_subscripted_section_to_box +! CHECK-NOT: fir.allocmem +! CHECK: fir.alloca !fir.array +! CHECK-NOT: fir.allocmem +! CHECK: fir.call @_QPtakes_box +! CHECK-NOT: fir.freemem +! CHECK: return +! CHECK-NEXT: } + +subroutine call_parenthesized_arg(x) + integer :: x(100) + call bar((x)) +end subroutine +! CHECK: func.func +! CHECK-SAME: call_parenthesized_arg +! CHECK-NOT: fir.allocmem +! CHECK: fir.alloca !fir.array<100xi32> +! CHECK-NOT: fir.allocmem +! CHECK: fir.call @_QPbar +! CHECK-NOT: fir.freemem +! CHECK: return +! CHECK-NEXT: } + +subroutine where_allocatable_assignments(a, b) + integer :: a(:) + integer, allocatable :: b(:) + where(b > 0) + b = a + elsewhere + b(:) = 0 + end where +end subroutine +! TODO: broken: passing allocation through fir.result +! CHECK: func.func +! CHECK-SAME: where_allocatable_assignments +! CHECK: return +! CHECK-NEXT: } + +subroutine array_constructor(a, b) + real :: a(5), b + real, external :: f + a = [f(b), f(b+1), f(b+2), f(b+5), f(b+11)] +end subroutine +! TODO: broken: realloc +! CHECK: func.func +! CHECK-SAME: array_constructor +! CHECK: return +! CHECK-NEXT: } + +subroutine sequence(seq, n) + integer :: n, seq(n) + seq = [(i,i=1,n)] +end subroutine +! TODO: broken: realloc +! CHECK: func.func +! CHECK-SAME: sequence +! CHECK: return +! CHECK-NEXT: } + +subroutine CFGLoop(x) + integer, parameter :: k = 100, m=1000000, n = k*m + integer :: x(n) + logical :: has_error + + do i=0,m-1 + x(k*i+1:k*(i+1)) = x(k*(i+1):k*i+1:-1) + if (has_error(x, k)) stop + end do +end subroutine +! CHECK: func.func +! 
CHECK-SAME: cfgloop +! CHECK-NEXT: %0 = fir.alloca !fir.array<100000000xi32> +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: return +! CHECK-NEXT: } diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir new file mode 100644 --- /dev/null +++ b/flang/test/Transforms/stack-arrays.fir @@ -0,0 +1,242 @@ +// RUN: fir-opt --stack-arrays %s | FileCheck %s + +// Simplest transformation +func.func @simple() { + %0 = fir.allocmem !fir.array<42xi32> + fir.freemem %0 : !fir.heap> + return +} +// CHECK: func.func @simple() { +// CHECK-NEXT: fir.alloca !fir.array<42xi32> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// Check fir.must_be_heap allocations are not moved +func.func @must_be_heap() { + %0 = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true} + fir.freemem %0 : !fir.heap> + return +} +// CHECK: func.func @must_be_heap() { +// CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true} +// CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// Check the data-flow-analysis can detect cases where we aren't sure if memory +// is freed by the end of the function +func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) { + %7 = arith.constant 42 : index + %8 = fir.allocmem !fir.array, %7 {uniq_name = "_QFdfa1Earr.alloc"} + %9 = fir.load %arg0 : !fir.ref> + %10 = fir.convert %9 : (!fir.logical<4>) -> i1 + fir.if %10 { + fir.freemem %8 : !fir.heap> + } else { + } + return +} +// CHECK: func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) { +// CHECK-NEXT: %c42 = arith.constant 42 : index +// CHECK-NEXT: %0 = fir.allocmem !fir.array, %c42 {uniq_name = "_QFdfa1Earr.alloc"} +// CHECK-NEXT: %1 = fir.load %arg0 : !fir.ref> +// CHECK-NEXT: %2 = fir.convert %1 : (!fir.logical<4>) -> i1 +// CHECK-NEXT: fir.if %2 { +// CHECK-NEXT: fir.freemem %0 : !fir.heap> +// CHECK-NEXT: } else { +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } + +// Check 
scf.if (fir.if is not considered a branch operation) +func.func @dfa2(%arg0: i1) { + %a = fir.allocmem !fir.array<1xi8> + scf.if %arg0 { + fir.freemem %a : !fir.heap> + } else { + } + return +} +// CHECK: func.func @dfa2(%arg0: i1) { +// CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<1xi8> +// CHECK-NEXT: scf.if %arg0 { +// CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap> +// CHECK-NEXT: } else { +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } + +// check the alloca is placed after all operands become available +func.func @placement1() { + // do some stuff with other ssa values + %1 = arith.constant 1 : index + %2 = arith.constant 2 : index + %3 = arith.addi %1, %2 : index + // operand is now available + %4 = fir.allocmem !fir.array, %3 + // ... + fir.freemem %4 : !fir.heap> + return +} +// CHECK: func.func @placement1() { +// CHECK-NEXT: %[[ONE:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[TWO:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[ARG:.*]] = arith.addi %[[ONE]], %[[TWO]] : index +// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[ARG]] +// CHECK-NEXT: return +// CHECK-NEXT: } + +// check that if there are no operands, then the alloca is placed early +func.func @placement2() { + // do some stuff with other ssa values + %1 = arith.constant 1 : index + %2 = arith.constant 2 : index + %3 = arith.addi %1, %2 : index + %4 = fir.allocmem !fir.array<42xi32> + // ... 
+ fir.freemem %4 : !fir.heap> + return +} +// CHECK: func.func @placement2() { +// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> +// CHECK-NEXT: %[[ONE:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[TWO:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index +// CHECK-NEXT: return +// CHECK-NEXT: } + +// check that stack allocations which must be placed in loops use stacksave +func.func @placement3() { + %c1 = arith.constant 1 : index + %c1_i32 = fir.convert %c1 : (index) -> i32 + %c2 = arith.constant 2 : index + %c10 = arith.constant 10 : index + %0:2 = fir.do_loop %arg0 = %c1 to %c10 step %c1 iter_args(%arg1 = %c1_i32) -> (index, i32) { + %3 = arith.addi %c1, %c2 : index + // operand is now available + %4 = fir.allocmem !fir.array, %3 + // ... + fir.freemem %4 : !fir.heap> + fir.result %3, %c1_i32 : index, i32 + } + return +} +// CHECK: func.func @placement3() { +// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 +// CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index +// CHECK-NEXT: fir.do_loop +// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index +// CHECK-NEXT: %[[SP:.*]] = fir.call @llvm.stacksave() : () -> !fir.ref +// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[SUM]] +// CHECK-NEXT: fir.call @llvm.stackrestore(%[[SP]]) +// CHECK-NEXT: fir.result +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } + +// check that stack save/restore are used in CFG loops +func.func @placement4(%arg0 : i1) { + %c1 = arith.constant 1 : index + %c1_i32 = fir.convert %c1 : (index) -> i32 + %c2 = arith.constant 2 : index + %c10 = arith.constant 10 : index + cf.br ^bb1 +^bb1: + %3 = arith.addi %c1, %c2 : index + // operand is now available + %4 = fir.allocmem !fir.array, %3 + // ... 
+ fir.freemem %4 : !fir.heap> + cf.cond_br %arg0, ^bb1, ^bb2 +^bb2: + return +} +// CHECK: func.func @placement4(%arg0: i1) { +// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 +// CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index +// CHECK-NEXT: cf.br ^bb1 +// CHECK-NEXT: ^bb1: +// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index +// CHECK-NEXT: %[[SP:.*]] = fir.call @llvm.stacksave() : () -> !fir.ref +// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[SUM]] +// CHECK-NEXT: fir.call @llvm.stackrestore(%[[SP]]) : (!fir.ref) -> () +// CHECK-NEXT: cf.cond_br %arg0, ^bb1, ^bb2 +// CHECK-NEXT: ^bb2: +// CHECK-NEXT: return +// CHECK-NEXT: } + +// check that stacksave is not used when there is an intervening alloca +func.func @placement5() { + %c1 = arith.constant 1 : index + %c1_i32 = fir.convert %c1 : (index) -> i32 + %c2 = arith.constant 2 : index + %c10 = arith.constant 10 : index + %0:2 = fir.do_loop %arg0 = %c1 to %c10 step %c1 iter_args(%arg1 = %c1_i32) -> (index, i32) { + %3 = arith.addi %c1, %c2 : index + // operand is now available + %4 = fir.allocmem !fir.array, %3 + %5 = fir.alloca i32 + fir.freemem %4 : !fir.heap> + fir.result %3, %c1_i32 : index, i32 + } + return +} +// CHECK: func.func @placement5() { +// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 +// CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index +// CHECK-NEXT: fir.do_loop +// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index +// CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array, %[[SUM]] +// CHECK-NEXT: %[[IDX:.*]] = fir.alloca i32 +// CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap> +// CHECK-NEXT: fir.result +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } + +// check that stack save/restore are not used 
when the memalloc and freemem are +// in different blocks +func.func @placement6(%arg0: i1) { + %c1 = arith.constant 1 : index + %c1_i32 = fir.convert %c1 : (index) -> i32 + %c2 = arith.constant 2 : index + %c10 = arith.constant 10 : index + cf.br ^bb1 +^bb1: + %3 = arith.addi %c1, %c2 : index + // operand is now available + %4 = fir.allocmem !fir.array, %3 + // ... + cf.cond_br %arg0, ^bb2, ^bb3 +^bb2: + // ... + fir.freemem %4 : !fir.heap> + cf.br ^bb1 +^bb3: + // ... + fir.freemem %4 : !fir.heap> + cf.br ^bb1 +} +// CHECK: func.func @placement6(%arg0: i1) { +// CHECK-NEXT: %[[c1:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[c1_i32:.*]] = fir.convert %[[c1]] : (index) -> i32 +// CHECK-NEXT: %[[c2:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[c10:.*]] = arith.constant 10 : index +// CHECK-NEXT: cf.br ^bb1 +// CHECK-NEXT: ^bb1: +// CHECK-NEXT: %[[ADD:.*]] = arith.addi %[[c1]], %[[c2]] : index +// CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array, %[[ADD]] +// CHECK-NEXT: cf.cond_br %arg0, ^bb2, ^bb3 +// CHECK-NEXT: ^bb2: +// CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap> +// CHECK-NEXT: cf.br ^bb1 +// CHECK-NEXT: ^bb3: +// CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap> +// CHECK-NEXT: cf.br ^bb1 +// CHECK-NEXT: }