diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -57,6 +57,7 @@ std::unique_ptr createMemoryAllocationPass(); std::unique_ptr createSimplifyIntrinsicsPass(); std::unique_ptr createAddDebugFoundationPass(); +std::unique_ptr createLoopVersioningPass(); std::unique_ptr createMemoryAllocationPass(bool dynOnHeap, std::size_t maxStackSize); diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -256,4 +256,16 @@ let constructor = "::fir::createAlgebraicSimplificationPass()"; } +def LoopVersioning : Pass<"loop-versioning", "mlir::func::FuncOp"> { + let summary = "Loop Versioning"; + let description = [{ + Loop Versioning pass adds a check and two variants of a loop when the input + array is an assumed size array, to optimise for the (often common) case where + an array has element sized strid. A fixed stride allows for the loop to be + vectorized as well as other loop optimisations. + }]; + let constructor = "::fir::createLoopVersioningPass()"; + let dependentDialects = [ "fir::FIROpsDialect" ]; +} + #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -167,7 +167,7 @@ /// /// \param pm - MLIR pass manager that will hold the pipeline definition inline void createDefaultFIROptimizerPassPipeline( - mlir::PassManager &pm, llvm::OptimizationLevel optLevel = defaultOptLevel) { + mlir::PassManager &pm, bool loopVersioning, llvm::OptimizationLevel optLevel = defaultOptLevel) { // simplify the IR mlir::GreedyRewriteConfig config; config.enableRegionSimplification = false; @@ -181,6 +181,12 @@ pm.addPass(fir::createSimplifyIntrinsicsPass()); pm.addPass(fir::createAlgebraicSimplificationPass(config)); } + + if (loopVersioning) { + llvm::outs() << "Creating loop versioning pass\n"; + pm.addPass(fir::createLoopVersioningPass()); + } + pm.addPass(mlir::createCSEPass()); fir::addMemoryAllocationOpt(pm); @@ -221,9 +227,9 @@ /// \param optLevel - optimization level used for creating FIR optimization /// passes pipeline inline void createMLIRToLLVMPassPipeline( - mlir::PassManager &pm, llvm::OptimizationLevel optLevel = defaultOptLevel) { + mlir::PassManager &pm, bool loopVersioning, llvm::OptimizationLevel optLevel = defaultOptLevel) { // Add default optimizer pass pipeline. - fir::createDefaultFIROptimizerPassPipeline(pm, optLevel); + fir::createDefaultFIROptimizerPassPipeline(pm, loopVersioning, optLevel); // Add codegen pass pipeline. fir::createDefaultFIRCodeGenPassPipeline(pm, optLevel); diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -13,6 +13,7 @@ AlgebraicSimplification.cpp SimplifyIntrinsics.cpp AddDebugFoundation.cpp + LoopVersioning.cpp DEPENDS FIRBuilder diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp new file mode 100644 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp @@ -0,0 +1,243 @@ +//===- LoopVersioning.cpp -- improve loop performance by duplicating certain +// loops -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +/// \file +/// This pass looks for loops iterating over assumed size loops, that can +/// be optimized by "guessing" that the stride is element-sized. +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Support/FIRContext.h" +#include "flang/Optimizer/Transforms/Passes.h" +#include "flang/Runtime/entry-names.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/RegionUtils.h" +#include "llvm/ADT/Optional.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + +namespace fir { +#define GEN_PASS_DEF_LOOPVERSIONING +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir + +#define DEBUG_TYPE "flang-loop-versioning" + +namespace { + +class LoopVersioningPass + : public fir::impl::LoopVersioningBase { + +public: + void runOnOperation() override; +}; + +} // namespace + +/// @c replaceOuterUses - replace uses outside of @c op with result of @c +/// outerOp +static void replaceOuterUses(mlir::Operation *op, mlir::Operation *outerOp) { + mlir::Operation *outerParent = outerOp->getParentOp(); + for (auto &use : op->getUses()) { + auto owner = use.getOwner(); + if (owner->getParentOp() == outerParent) { + unsigned index = use.getOperandNumber(); + // TODO: need to figure out which index results to use! + owner->setOperand(index, outerOp->getResult(1)); + } + } +} + +static std::optional getAsSequenceType(mlir::Value *v) { + mlir::Type argTy = v->getType(); + if (auto boxTy = argTy.dyn_cast()) + if (!fir::isBoxNone(boxTy)) + return boxTy.getEleTy().dyn_cast(); + return {}; +} + +void LoopVersioningPass::runOnOperation() { + LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n"); + auto func = getOperation(); + + // First look for arguments with unknown size. + LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n"); + auto args = func.getArguments(); + mlir::SmallVector argsOfInterest; + for (auto &arg : args) { + if (auto seqTy = getAsSequenceType(&arg)) { + unsigned rank = seqTy->getDimension(); + // TODO: Support > 2 dimensions. + if (rank > 0 && rank < 3 && + seqTy->getShape()[0] == fir::SequenceType::getUnknownExtent()) + argsOfInterest.push_back(&arg); + } + } + + if (argsOfInterest.empty()) + return; + + struct ArgsAndDims { + mlir::Value *arg; + fir::BoxDimsOp dims; + }; + + struct OpsWithArgs { + mlir::Operation *op; + mlir::SmallVector argsAndDims; + }; + // Now see if those arguments are used inside any loop. + mlir::SmallVector opsOfInterest; + + func.walk([&](mlir::Operation *op) { + if (fir::DoLoopOp loop = mlir::dyn_cast(op)) { + auto &body = *loop.getBody(); + mlir::SmallVector argsInLoop; + body.walk([&](mlir::Operation *op2) { + // If the current loop is an inner loop, we'll get to it later, + // so just skip over those. + if (op2->getParentOfType() != loop) + return; + for (auto operand : op2->getOperands()) { + for (auto a : argsOfInterest) { + if (*a == operand) { + // Only add if it's not already in the list. + if (std::find_if(argsInLoop.begin(), argsInLoop.end(), + [&](auto it) { return it.arg == a; }) == + argsInLoop.end()) + argsInLoop.push_back({a, nullptr}); + } + } + } + }); + + if (!argsInLoop.empty()) { + OpsWithArgs ops = {op, argsInLoop}; + opsOfInterest.push_back(ops); + } + } + }); + if (opsOfInterest.empty()) + return; + + // Ok, so we have some work to do here... + mlir::ModuleOp module = func->getParentOfType(); + fir::KindMapping kindMap = fir::getKindMapping(module); + + fir::FirOpBuilder builder{module, kindMap}; + auto loc = builder.getUnknownLoc(); + mlir::IndexType idxTy = builder.getIndexType(); + + LLVM_DEBUG(llvm::dbgs() << "Module Before transformation:"); + LLVM_DEBUG(module->dump()); + + LLVM_DEBUG(llvm::dbgs() << "opsOfInterest: " << opsOfInterest.size() << "\n"); + for (auto op : opsOfInterest) { + LLVM_DEBUG(op.op->dump()); + builder.setInsertionPoint(op.op); + + mlir::Value allCompares = nullptr; + for (auto &arg : op.argsAndDims) { + auto elementType = fir::unwrapSeqOrBoxedSeqType(arg.arg->getType()); + + // We only care about lowest order dimension. + mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, 0); + arg.dims = builder.create(loc, idxTy, idxTy, idxTy, + *arg.arg, dimIdx); + + mlir::Value elemSize = builder.createIntegerConstant( + loc, idxTy, elementType.getIntOrFloatBitWidth() / 8); + + mlir::Value cmp = builder.create( + loc, mlir::arith::CmpIPredicate::eq, arg.dims.getResult(2), elemSize); + if (!allCompares) + allCompares = cmp; + else + allCompares = + builder.create(loc, cmp, allCompares); + } + + auto ifOp = + builder.create(loc, op.op->getResultTypes(), allCompares, + /*withElse=*/true); + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + + llvm::dbgs() << "Creating cloned loop\n"; + mlir::Operation *clonedLoop = op.op->clone(); + bool changed = false; + for (auto arg : op.argsAndDims) { + fir::SequenceType::Shape newShape; + newShape.push_back(fir::SequenceType::getUnknownExtent()); + auto elementType = fir::unwrapSeqOrBoxedSeqType(arg.arg->getType()); + mlir::Type arrTy = fir::SequenceType::get(newShape, elementType); + mlir::Type boxArrTy = fir::BoxType::get(arrTy); + mlir::Type refArrTy = builder.getRefType(arrTy); + auto carg = builder.create(loc, boxArrTy, *arg.arg); + auto caddr = builder.create(loc, refArrTy, carg); + auto insPt = builder.saveInsertionPoint(); + // Use caddr instead of arg. + clonedLoop->walk([&](mlir::Operation *op) { + if (fir::CoordinateOp coop = mlir::dyn_cast(op)) { + if (coop->getOperand(0) == *arg.arg && + coop->getOperands().size() == 3) { + builder.setInsertionPoint(coop); + mlir::Value index = builder.create( + loc, builder.createConvert(loc, idxTy, coop->getOperand(1)), + builder.create( + loc, builder.createConvert(loc, idxTy, coop->getOperand(2)), + builder.createConvert(loc, idxTy, arg.dims.getResult(1)))); + auto newOp = builder.create( + loc, builder.getRefType(elementType), caddr, + mlir::ValueRange{index}); + LLVM_DEBUG(newOp->dump()); + coop->getResult(0).replaceAllUsesWith(newOp->getResult(0)); + coop->erase(); + changed = true; + } + } + // op->replaceUsesOfWith(*arg.arg, caddr); + }); + + builder.restoreInsertionPoint(insPt); + } + if (!changed) + llvm::dbgs() << "No coordinates usage?\n"; + builder.insert(clonedLoop); + auto result = builder.create(loc, clonedLoop->getResults()); + + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + op.op->remove(); + builder.insert(op.op); + result = builder.create(loc, op.op->getResults()); + replaceOuterUses(op.op, ifOp); + } + + LLVM_DEBUG(llvm::dbgs() << "After transform:\n"); + LLVM_DEBUG(module->dump()); + + LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n"); +} + +std::unique_ptr fir::createLoopVersioningPass() { + return std::make_unique(); +}