diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -55,6 +55,7 @@
 std::unique_ptr<mlir::Pass> createPromoteToAffinePass();
 std::unique_ptr<mlir::Pass> createMemoryAllocationPass();
 std::unique_ptr<mlir::Pass> createSimplifyIntrinsicsPass();
+std::unique_ptr<mlir::Pass> createLoopVersioningPass();
 
 std::unique_ptr<mlir::Pass>
 createMemoryAllocationPass(bool dynOnHeap, std::size_t maxStackSize);
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -237,4 +237,15 @@
   let constructor = "::fir::createAlgebraicSimplificationPass()";
 }
 
+def LoopVersioning : Pass<"loop-versioning", "mlir::func::FuncOp"> {
+  let summary = "Loop Versioning";
+  let description = [{
+    Loop Versioning pass adds a check and two variants of a loop when the input
+    array is an assumed shape array, to optimise for the (often common) case
+    where an array has element sized stride. A fixed stride allows for the loop
+    to be vectorized as well as other loop optimisations.
+  }];
+  let constructor = "::fir::createLoopVersioningPass()";
+}
+
 #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -156,7 +156,9 @@
 ///
 /// \param pm - MLIR pass manager that will hold the pipeline definition
+/// \param loopVersioning - when true, add the loop versioning pass
 inline void createDefaultFIROptimizerPassPipeline(
-    mlir::PassManager &pm, llvm::OptimizationLevel optLevel = defaultOptLevel) {
+    mlir::PassManager &pm, bool loopVersioning,
+    llvm::OptimizationLevel optLevel = defaultOptLevel) {
   // simplify the IR
   mlir::GreedyRewriteConfig config;
   config.enableRegionSimplification = false;
@@ -170,6 +172,11 @@
     pm.addPass(fir::createSimplifyIntrinsicsPass());
     pm.addPass(fir::createAlgebraicSimplificationPass(config));
   }
+
+  if (loopVersioning) {
+    pm.addPass(fir::createLoopVersioningPass());
+  }
+
   pm.addPass(mlir::createCSEPass());
   fir::addMemoryAllocationOpt(pm);
 
@@ -209,9 +216,11 @@
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
+/// \param loopVersioning - when true, add the loop versioning pass
 inline void createMLIRToLLVMPassPipeline(
-    mlir::PassManager &pm, llvm::OptimizationLevel optLevel = defaultOptLevel) {
+    mlir::PassManager &pm, bool loopVersioning,
+    llvm::OptimizationLevel optLevel = defaultOptLevel) {
   // Add default optimizer pass pipeline.
-  fir::createDefaultFIROptimizerPassPipeline(pm, optLevel);
+  fir::createDefaultFIROptimizerPassPipeline(pm, loopVersioning, optLevel);
 
   // Add codegen pass pipeline.
   fir::createDefaultFIRCodeGenPassPipeline(pm);
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -131,7 +131,6 @@
     opts.LoopVersioning = 1;
   }
 
-
   for (auto *a : args.filtered(clang::driver::options::OPT_fpass_plugin_EQ))
     opts.LLVMPassPlugins.push_back(a->getValue());
 
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -12,6 +12,7 @@
   SimplifyRegionLite.cpp
   AlgebraicSimplification.cpp
   SimplifyIntrinsics.cpp
+  LoopVersioning.cpp
 
   DEPENDS
   FIRBuilder
diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
new file mode 100644
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
@@ -0,0 +1,195 @@
+//===- LoopVersioning.cpp -- improve loop performance by duplicating certain
+// loops -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass looks for loops iterating over assumed shape arrays, that can
+/// be optimized by "guessing" that the stride is element-sized.
+///
+/// Each selected loop is guarded by a run-time test of the array's stride:
+/// when the test succeeds a copy of the loop that addresses the array data
+/// directly is executed, otherwise the original loop is executed.
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Support/FIRContext.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "flang/Runtime/entry-names.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+
+namespace fir {
+#define GEN_PASS_DEF_LOOPVERSIONING
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "flang-loop-versioning"
+
+namespace {
+
+/// A loop selected for versioning and the boxed array argument it references.
+struct LoopCandidate {
+  mlir::Operation *loop;
+  mlir::Value arg;
+};
+
+/// Function pass that versions loops over boxed array arguments.
+class LoopVersioningPass
+    : public fir::impl::LoopVersioningBase<LoopVersioningPass> {
+
+public:
+  void runOnOperation() override;
+};
+
+} // namespace
+
+/// Guard every outermost loop that references a rank-1 boxed array argument
+/// of unknown extent with a stride check, and give it a fast copy that is
+/// used when the stride equals the size of one element.
+void LoopVersioningPass::runOnOperation() {
+  LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n");
+  mlir::func::FuncOp func = getOperation();
+
+  // First look for arguments with unknown size.
+  LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n");
+  mlir::SmallVector<mlir::Value> argsOfInterest;
+  for (mlir::BlockArgument arg : func.getArguments()) {
+    auto boxTy = arg.getType().dyn_cast<fir::BoxType>();
+    if (!boxTy)
+      continue;
+    auto seqTy = boxTy.getEleTy().dyn_cast<fir::SequenceType>();
+    // The fast path views the data as a one-dimensional array, so only
+    // rank-1 arguments can be handled.
+    if (seqTy && seqTy.hasDynamicExtents() && seqTy.getDimension() == 1)
+      argsOfInterest.push_back(arg);
+  }
+
+  if (argsOfInterest.empty())
+    return;
+
+  // Now see if those arguments are used inside any loop.
+  mlir::SmallVector<LoopCandidate> loopsOfInterest;
+  func.walk([&](fir::DoLoopOp loop) {
+    // The scan below descends into nested loops, so whenever an inner loop
+    // qualifies its enclosing loop does too. Version the outermost loop only;
+    // its copy already contains everything nested in it.
+    // TODO: Revisit whether the innermost loop is the better choice.
+    if (loop->getParentOfType<fir::DoLoopOp>())
+      return;
+
+    // TODO: Only the first argument found is handled; a loop that uses
+    // several arrays needs a check for each of them.
+    mlir::Value usedArg;
+    loop.getBody()->walk([&](mlir::Operation *user) {
+      if (usedArg)
+        return;
+      for (mlir::Value operand : user->getOperands()) {
+        if (std::find(argsOfInterest.begin(), argsOfInterest.end(),
+                      operand) != argsOfInterest.end()) {
+          usedArg = operand;
+          return;
+        }
+      }
+    });
+    if (usedArg)
+      loopsOfInterest.push_back(LoopCandidate{loop.getOperation(), usedArg});
+  });
+  if (loopsOfInterest.empty())
+    return;
+
+  // Ok, so we have some work to do here...
+  mlir::ModuleOp module = func->getParentOfType<mlir::ModuleOp>();
+  fir::KindMapping kindMap = fir::getKindMapping(module);
+
+  fir::FirOpBuilder builder{module, kindMap};
+  mlir::IndexType idxTy = builder.getIndexType();
+
+  LLVM_DEBUG(llvm::dbgs() << "loopsOfInterest: " << loopsOfInterest.size()
+                          << "\n");
+  for (const LoopCandidate &candidate : loopsOfInterest) {
+    mlir::Operation *op = candidate.loop;
+    mlir::Value arg = candidate.arg;
+    LLVM_DEBUG(op->dump());
+
+    // The stride is compared with the size of one element, which is only
+    // derived here for integer and floating point types of whole bytes.
+    mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType());
+    if (!elementType.isIntOrFloat())
+      continue;
+    unsigned elementBits = elementType.getIntOrFloatBitWidth();
+    if (elementBits % 8 != 0)
+      continue;
+
+    mlir::Location loc = op->getLoc();
+    builder.setInsertionPoint(op);
+
+    // Only rank-1 arguments are collected, so dimension 0 is the one to test.
+    mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, 0);
+    auto dims = builder.create<fir::BoxDimsOp>(loc, idxTy, idxTy, idxTy, arg,
+                                               dimIdx);
+    mlir::Value elemSize =
+        builder.createIntegerConstant(loc, idxTy, elementBits / 8);
+
+    mlir::Value cmp = builder.create<mlir::arith::CmpIOp>(
+        loc, mlir::arith::CmpIPredicate::eq, dims.getResult(2), elemSize);
+
+    // The if carries the loop's results so that users of the original loop
+    // can be redirected to it.
+    const bool hasResults = op->getNumResults() != 0;
+    auto ifOp = builder.create<fir::IfOp>(loc, op->getResultTypes(), cmp,
+                                          /*withElseRegion=*/true);
+
+    // Then-side: a copy of the loop that uses the raw address of the data.
+    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+    fir::SequenceType::Shape flatShape(1,
+                                       fir::SequenceType::getUnknownExtent());
+    mlir::Type arrTy = fir::SequenceType::get(flatShape, elementType);
+    mlir::Type boxArrTy = fir::BoxType::get(arrTy);
+    mlir::Type refArrTy = builder.getRefType(arrTy);
+
+    auto carg = builder.create<fir::ConvertOp>(loc, boxArrTy, arg);
+    auto caddr = builder.create<fir::BoxAddrOp>(loc, refArrTy, carg);
+    mlir::Operation *clonedLoop = builder.insert(op->clone());
+    clonedLoop->walk([&](mlir::Operation *nested) {
+      nested->replaceUsesOfWith(arg, caddr);
+    });
+    if (hasResults)
+      builder.create<fir::ResultOp>(loc, clonedLoop->getResults());
+
+    // Else-side: the original loop, so that it runs only when the check
+    // fails instead of a second time after the fast copy.
+    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+    op->replaceAllUsesWith(ifOp.getOperation());
+    op->remove();
+    builder.insert(op);
+    if (hasResults)
+      builder.create<fir::ResultOp>(loc, op->getResults());
+  }
+  LLVM_DEBUG(module->dump());
+
+  LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n");
+}
+
+std::unique_ptr<mlir::Pass> fir::createLoopVersioningPass() {
+  return std::make_unique<LoopVersioningPass>();
+}
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -273,7 +273,8 @@
     pm.addPass(std::make_unique<Fortran::lower::VerifierPass>());
 
     // Add O2 optimizer pass pipeline.
-    fir::createDefaultFIROptimizerPassPipeline(pm, false, llvm::OptimizationLevel::O2);
+    fir::createDefaultFIROptimizerPassPipeline(pm, false,
+                                               llvm::OptimizationLevel::O2);
   }
 
   if (mlir::succeeded(pm.run(mlirModule))) {
diff --git a/flang/tools/tco/tco.cpp b/flang/tools/tco/tco.cpp
--- a/flang/tools/tco/tco.cpp
+++ b/flang/tools/tco/tco.cpp
@@ -122,7 +122,7 @@
     fir::createDefaultFIRCodeGenPassPipeline(pm);
   } else {
     // Run tco with O2 by default.
-    fir::createMLIRToLLVMPassPipeline(pm, false, llvm::OptimizationLevel::O2);
+    fir::createMLIRToLLVMPassPipeline(pm, false, llvm::OptimizationLevel::O2);
   }
   fir::addLLVMDialectToLLVMPass(pm, out.os());
 }