diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -61,6 +61,7 @@ std::unique_ptr createStackArraysPass(); std::unique_ptr createSimplifyIntrinsicsPass(); std::unique_ptr createAddDebugFoundationPass(); +std::unique_ptr createLoopVersioningPass(); std::unique_ptr createMemoryAllocationPass(bool dynOnHeap, std::size_t maxStackSize); diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -283,6 +283,17 @@ "fir::FIROpsDialect", "mlir::func::FuncDialect" ]; } - +def LoopVersioning : Pass<"loop-versioning", "mlir::func::FuncOp"> { + let summary = "Loop Versioning"; + let description = [{ + Loop Versioning pass adds a check and two variants of a loop when the input + array is an assumed size array, to optimise for the (often common) case where + an array has element sized strid. A fixed stride allows for the loop to be + vectorized as well as other loop optimisations. + }]; + let constructor = "::fir::createLoopVersioningPass()"; + let dependentDialects = [ "fir::FIROpsDialect" ]; +} + #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -185,6 +185,12 @@ pm.addPass(fir::createSimplifyIntrinsicsPass()); pm.addPass(fir::createAlgebraicSimplificationPass(config)); } + + if (loopVersioning) { + llvm::outs() << "Creating loop versioning pass\n"; + pm.addPass(fir::createLoopVersioningPass()); + } + pm.addPass(mlir::createCSEPass()); if (stackArrays) diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -15,6 +15,7 @@ SimplifyIntrinsics.cpp AddDebugFoundation.cpp PolymorphicOpConversion.cpp + LoopVersioning.cpp DEPENDS FIRBuilder diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp new file mode 100644 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp @@ -0,0 +1,270 @@ +//===- LoopVersioning.cpp -- improve loop performance by duplicating certain +// loops -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +/// \file +/// This pass looks for loops iterating over assumed size loops, that can +/// be optimized by "guessing" that the stride is element-sized. +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Runtime/Inquiry.h" +#include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Support/FIRContext.h" +#include "flang/Optimizer/Transforms/Passes.h" +#include "flang/Runtime/entry-names.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/RegionUtils.h" +#include "llvm/ADT/Optional.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + +namespace fir { +#define GEN_PASS_DEF_LOOPVERSIONING +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir + +#define DEBUG_TYPE "flang-loop-versioning" + +namespace { + +class LoopVersioningPass + : public fir::impl::LoopVersioningBase { + +public: + void runOnOperation() override; +}; + +} // namespace + +/// @c replaceOuterUses - replace uses outside of @c op with result of @c +/// outerOp +static void replaceOuterUses(mlir::Operation *op, mlir::Operation *outerOp) { + mlir::Operation *outerParent = outerOp->getParentOp(); + for (auto &use : op->getUses()) { + auto owner = use.getOwner(); + if (owner->getParentOp() == outerParent) { + unsigned index = use.getOperandNumber(); + owner->setOperand(index, outerOp->getResult(1)); + } + } +} + +static std::optional getAsSequenceType(mlir::Value *v) { + mlir::Type argTy = v->getType(); + if (auto boxTy = argTy.dyn_cast()) + if (!fir::isBoxNone(boxTy)) + return boxTy.getEleTy().dyn_cast(); + return {}; +} + +void LoopVersioningPass::runOnOperation() { + LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n"); + auto func = getOperation(); + + struct ArgInfo { + mlir::Value *arg; + size_t size; + fir::BoxDimsOp dims[6]; + }; + + // First look for arguments with unknown size. + LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n"); + auto args = func.getArguments(); + mlir::ModuleOp module = func->getParentOfType(); + fir::KindMapping kindMap = fir::getKindMapping(module); + mlir::SmallVector argsOfInterest; + for (auto &arg : args) { + if (auto seqTy = getAsSequenceType(&arg)) { + unsigned rank = seqTy->getDimension(); + if (rank > 0 && rank < 3 && + seqTy->getShape()[0] == fir::SequenceType::getUnknownExtent()) { + size_t typeSize = 0; + auto elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType()); + if (elementType.isa() || + elementType.isa()) + typeSize = elementType.getIntOrFloatBitWidth() / 8; + else if (auto cty = elementType.dyn_cast()) + typeSize = 2 * cty.getEleType(kindMap).getIntOrFloatBitWidth() / 8; + if (typeSize) + argsOfInterest.push_back({&arg, typeSize, {}}); + else + llvm::dbgs() << "Too many dimensions\n"; + } + } + } + + if (argsOfInterest.empty()) + return; + + struct OpsWithArgs { + mlir::Operation *op; + mlir::SmallVector argsAndDims; + }; + // Now see if those arguments are used inside any loop. + mlir::SmallVector opsOfInterest; + + func.walk([&](mlir::Operation *op) { + if (fir::DoLoopOp loop = mlir::dyn_cast(op)) { + auto &body = *loop.getBody(); + mlir::SmallVector argsInLoop; + body.walk([&](mlir::Operation *op2) { + // If the current loop is an inner loop, we'll get to it later, + // so just skip over those. + if (op2->getParentOfType() != loop) + return; + if (mlir::isa(op2)) { + const mlir::Value &operand = op2->getOperand(0); + for (auto a : argsOfInterest) { + if (*a.arg == operand) { + // Only add if it's not already in the list. + if (std::find_if(argsInLoop.begin(), argsInLoop.end(), + [&](auto it) { return it.arg == a.arg; }) == + argsInLoop.end()) + argsInLoop.push_back(a); + } + } + } + }); + + if (!argsInLoop.empty()) { + OpsWithArgs ops = {op, argsInLoop}; + opsOfInterest.push_back(ops); + } + } + }); + if (opsOfInterest.empty()) + return; + + // Ok, so we have some work to do here... + fir::FirOpBuilder builder{module, kindMap}; + auto loc = builder.getUnknownLoc(); + mlir::IndexType idxTy = builder.getIndexType(); + + LLVM_DEBUG(llvm::dbgs() << "Module Before transformation:"); + LLVM_DEBUG(module->dump()); + + LLVM_DEBUG(llvm::dbgs() << "opsOfInterest: " << opsOfInterest.size() << "\n"); + for (auto op : opsOfInterest) { + LLVM_DEBUG(op.op->dump()); + builder.setInsertionPoint(op.op); + + mlir::Value allCompares = nullptr; + for (auto &arg : op.argsAndDims) { + + auto seqTy = getAsSequenceType(arg.arg); + unsigned rank = seqTy->getDimension(); + + // We only care about lowest order dimension. + for (unsigned i = 0; i < rank; i++) { + mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i); + arg.dims[i] = builder.create(loc, idxTy, idxTy, idxTy, + *arg.arg, dimIdx); + } + mlir::Value elemSize = + builder.createIntegerConstant(loc, idxTy, arg.size); + mlir::Value cmp = builder.create( + loc, mlir::arith::CmpIPredicate::eq, arg.dims[0].getResult(2), + elemSize); + if (!allCompares) + allCompares = cmp; + else + allCompares = + builder.create(loc, cmp, allCompares); + } + + auto ifOp = + builder.create(loc, op.op->getResultTypes(), allCompares, + /*withElse=*/true); + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + + llvm::dbgs() << "Creating cloned loop\n"; + mlir::Operation *clonedLoop = op.op->clone(); + bool changed = false; + for (auto &arg : op.argsAndDims) { + fir::SequenceType::Shape newShape; + newShape.push_back(fir::SequenceType::getUnknownExtent()); + auto elementType = fir::unwrapSeqOrBoxedSeqType(arg.arg->getType()); + mlir::Type arrTy = fir::SequenceType::get(newShape, elementType); + mlir::Type boxArrTy = fir::BoxType::get(arrTy); + mlir::Type refArrTy = builder.getRefType(arrTy); + auto carg = builder.create(loc, boxArrTy, *arg.arg); + auto caddr = builder.create(loc, refArrTy, carg); + auto insPt = builder.saveInsertionPoint(); + // Use caddr instead of arg. + clonedLoop->walk([&](mlir::Operation *op) { + // Reduce the multi-dimensioned index to a single index. + if (fir::CoordinateOp coop = mlir::dyn_cast(op)) { + if (coop->getOperand(0) == *arg.arg && + coop->getOperands().size() >= 2) { + builder.setInsertionPoint(coop); + mlir::Value totalIndex = + builder.createIntegerConstant(loc, idxTy, 0); + // Operand(1) = array; Operand(2) = index1; Operand(3) = index2 + for (unsigned i = coop->getOperands().size() - 1; i > 1; i--) { + mlir::Value curIndex = + builder.createConvert(loc, idxTy, coop->getOperand(i)); + // First arg is Operand2, so dims[i-2] is 0-based i-1! + mlir::Value scale = builder.createConvert( + loc, idxTy, arg.dims[i - 2].getResult(1)); + totalIndex = builder.create( + loc, totalIndex, + builder.create(loc, scale, curIndex)); + } + totalIndex = builder.create( + loc, totalIndex, + builder.createConvert(loc, idxTy, coop->getOperand(1))); + + auto newOp = builder.create( + loc, builder.getRefType(elementType), caddr, + mlir::ValueRange{totalIndex}); + LLVM_DEBUG(newOp->dump()); + coop->getResult(0).replaceAllUsesWith(newOp->getResult(0)); + coop->erase(); + changed = true; + } + } + }); + + builder.restoreInsertionPoint(insPt); + } + if (!changed) { + llvm::dbgs() << "No coordinates usage?\n"; + } + builder.insert(clonedLoop); + auto result = builder.create(loc, clonedLoop->getResults()); + + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + replaceOuterUses(op.op, ifOp); + op.op->remove(); + builder.insert(op.op); + result = builder.create(loc, op.op->getResults()); + } + + LLVM_DEBUG(llvm::dbgs() << "After transform:\n"); + LLVM_DEBUG(module->dump()); + + LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n"); +} + +std::unique_ptr fir::createLoopVersioningPass() { + return std::make_unique(); +} diff --git a/flang/test/Transforms/loop-versioing.fir b/flang/test/Transforms/loop-versioing.fir new file mode 100644 --- /dev/null +++ b/flang/test/Transforms/loop-versioing.fir @@ -0,0 +1,115 @@ +// RUN: fir-opt --loop-versioning %s | FileCheck %s + + +// subroutine sum1d(a, n) +// real*8 :: a(:) +// integer :: n +// real*8 :: sum +// integer :: i +// sum = 0 +// do i=1,n +// sum = sum + a(i) +// end do +// end subroutine sum1d +module { + func.func @sum1d(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "n"}) { + %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMmoduleFsum1dEi"} + %1 = fir.alloca f64 {bindc_name = "sum", uniq_name = "_QMmoduleFsum1dEsum"} + %cst = arith.constant 0.000000e+00 : f64 + fir.store %cst to %1 : !fir.ref + %c1_i32 = arith.constant 1 : i32 + %2 = fir.convert %c1_i32 : (i32) -> index + %3 = fir.load %arg1 : !fir.ref + %4 = fir.convert %3 : (i32) -> index + %c1 = arith.constant 1 : index + %5 = fir.convert %2 : (index) -> i32 + %6:2 = fir.do_loop %arg2 = %2 to %4 step %c1 iter_args(%arg3 = %5) -> (index, i32) { + fir.store %arg3 to %0 : !fir.ref + %7 = fir.load %1 : !fir.ref + %8 = fir.load %0 : !fir.ref + %9 = fir.convert %8 : (i32) -> i64 + %c1_i64 = arith.constant 1 : i64 + %10 = arith.subi %9, %c1_i64 : i64 + %11 = fir.coordinate_of %arg0, %10 : (!fir.box>, i64) -> !fir.ref + %12 = fir.load %11 : !fir.ref + %13 = arith.addf %7, %12 fastmath : f64 + fir.store %13 to %1 : !fir.ref + %14 = arith.addi %arg2, %c1 : index + %15 = fir.convert %c1 : (index) -> i32 + %16 = fir.load %0 : !fir.ref + %17 = arith.addi %16, %15 : i32 + fir.result %14, %17 : index, i32 + } + fir.store %6#1 to %0 : !fir.ref + return + } + +// Note this only checks the expected transformation, not the entire generated code: +// CHECK-LABEL: func.func @sum1d( +// CHECK-SAME: %[[ARG0:.*]]: !fir.box> {{.*}}) +// CHECK: %[[ZERO:.*]] = arith.constant 0 : index +// CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}} +// CHECK: %[[SIZE:.*]] = arith.constant 8 : index +// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS]]#2, %[[SIZE]] +// CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}} +// CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]] +// CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref> +// CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}} +// CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %{{.*}} : (!fir.ref>, index) -> !fir.ref +// CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref +// CHECK: fir.result %{{.*}}, %{{.*}} +// CHECK: } +// CHECK fir.result %[[LOOP_RES]]#0, %[[LOOP_RES]]#1 +// CHECK: } else { +// CHECK: %[[LOOP_RES2:.*]]:2 = fir.do_loop {{.*}} +// CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG0]], %{{.*}} : (!fir.box>, i64) -> !fir.ref +// CHECK: %{{.*}}= fir.load %[[COORD2]] : !fir.ref +// CHECK: fir.result %{{.*}}, %{{.*}} +// CHECK: } +// CHECK fir.result %[[LOOP_RES2]]#0, %[[LOOP_RES2]]#1 +// CHECK: } +// CHECK: fir.store %[[IF_RES]]#1 to %{{.*}} +// CHECK: return + +// ----- + +// Test that loop-versioning pass doesn't expant known size arrays. +func.func @sum1dfixed(%arg0: !fir.ref> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "n"}) { + %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsum1dfixedEi"} + %1 = fir.alloca f64 {bindc_name = "sum", uniq_name = "_QFsum1dfixedEsum"} + %cst = arith.constant 0.000000e+00 : f64 + fir.store %cst to %1 : !fir.ref + %c1_i32 = arith.constant 1 : i32 + %2 = fir.convert %c1_i32 : (i32) -> index + %3 = fir.load %arg1 : !fir.ref + %4 = fir.convert %3 : (i32) -> index + %c1 = arith.constant 1 : index + %5 = fir.convert %2 : (index) -> i32 + %6:2 = fir.do_loop %arg2 = %2 to %4 step %c1 iter_args(%arg3 = %5) -> (index, i32) { + fir.store %arg3 to %0 : !fir.ref + %7 = fir.load %1 : !fir.ref + %8 = fir.load %0 : !fir.ref + %9 = fir.convert %8 : (i32) -> i64 + %c1_i64 = arith.constant 1 : i64 + %10 = arith.subi %9, %c1_i64 : i64 + %11 = fir.coordinate_of %arg0, %10 : (!fir.ref>, i64) -> !fir.ref + %12 = fir.load %11 : !fir.ref + %13 = arith.addf %7, %12 fastmath : f64 + fir.store %13 to %1 : !fir.ref + %14 = arith.addi %arg2, %c1 : index + %15 = fir.convert %c1 : (index) -> i32 + %16 = fir.load %0 : !fir.ref + %17 = arith.addi %16, %15 : i32 + fir.result %14, %17 : index, i32 + } + fir.store %6#1 to %0 : !fir.ref + return + } + +// CHECK-LABEL: func.func @sum1dfixed( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref> {{.*}}) +// CHECK: fir.do_loop {{.*}} +// CHECK: %[[COORD:.*]] = fir.coordinate_of %[[ARG0]], {{.*}} +// CHECK: %{{.*}} = fir.load %[[COORD]] + +} // End module \ No newline at end of file