diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -62,6 +62,7 @@ std::unique_ptr createStackArraysPass(); std::unique_ptr createSimplifyIntrinsicsPass(); std::unique_ptr createAddDebugFoundationPass(); +std::unique_ptr createLoopVersioningPass(); std::unique_ptr createMemoryAllocationPass(bool dynOnHeap, std::size_t maxStackSize); diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -294,4 +294,16 @@ ]; } +def LoopVersioning : Pass<"loop-versioning", "mlir::func::FuncOp"> { + let summary = "Loop Versioning"; + let description = [{ + Loop Versioning pass adds a check and two variants of a loop when the input + array is an assumed shape array, to optimize for the (often common) case where + an array has element sized stride. The element sizes stride allows some + loops to be vectorized as well as other loop optimizations. + }]; + let constructor = "::fir::createLoopVersioningPass()"; + let dependentDialects = [ "fir::FIROpsDialect" ]; +} + #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -200,6 +200,10 @@ pm.addPass(fir::createSimplifyIntrinsicsPass()); pm.addPass(fir::createAlgebraicSimplificationPass(config)); } + + if (loopVersioning) + pm.addPass(fir::createLoopVersioningPass()); + pm.addPass(mlir::createCSEPass()); if (stackArrays) diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -16,6 +16,7 @@ AddDebugFoundation.cpp PolymorphicOpConversion.cpp OpenACC/OpenACCDataOperandConversion.cpp + LoopVersioning.cpp DEPENDS FIRDialect diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp new file mode 100644 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp @@ -0,0 +1,313 @@ +//===- LoopVersioning.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +/// \file +/// This pass looks for loops iterating over assumed-shape arrays, that can +/// be optimized by "guessing" that the stride is element-sized. +/// +/// This is done by createing two versions of the same loop: one which assumes +/// that the elements are contiguous (stride == size of element), and one that +/// is the original generic loop. +/// +/// As a side-effect of the assumed element size stride, the array is also +/// flattened to make it a 1D array - this is because the internal array +/// structure must be either 1D or have known sizes in all dimensions - and at +/// least one of the dimensions here is already unknown. +/// +/// There are two distinct benefits here: +/// 1. The loop that iterates over the elements is somewhat simplified by the +/// constant stride calculation. +/// 2. Since the compiler can understand the size of the stride, it can use +/// vector instructions, where an unknown (at compile time) stride does often +/// prevent vector operations from being used. +/// +/// A known drawback is that the code-size is increased, in some cases that can +/// be quite substantial - 3-4x is quite plausible (this includes that the loop +/// gets vectorized, which in itself often more than doubles the size of the +/// code, because unless the loop size is known, there will be a modulo +/// vector-size remainder to deal with. +/// +/// TODO: Do we need some size limit where loops no longer get duplicated? +// Maybe some sort of cost analysis. +/// TODO: Should some loop content - for example calls to functions and +/// subroutines inhibit the versioning of the loops. Plausibly, this +/// could be part of the cost analysis above. +//===----------------------------------------------------------------------===// + +#include "flang/ISO_Fortran_binding.h" +#include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Runtime/Inquiry.h" +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Dialect/Support/FIRContext.h" +#include "flang/Optimizer/Dialect/Support/KindMapping.h" +#include "flang/Optimizer/Transforms/Passes.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/RegionUtils.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include + +namespace fir { +#define GEN_PASS_DEF_LOOPVERSIONING +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir + +#define DEBUG_TYPE "flang-loop-versioning" + +namespace { + +class LoopVersioningPass + : public fir::impl::LoopVersioningBase { + +public: + void runOnOperation() override; +}; + +} // namespace + +/// @c replaceOuterUses - replace uses outside of @c op with result of @c +/// outerOp +static void replaceOuterUses(mlir::Operation *op, mlir::Operation *outerOp) { + const mlir::Operation *outerParent = outerOp->getParentOp(); + op->replaceUsesWithIf(outerOp, [&](mlir::OpOperand &operand) { + mlir::Operation *owner = operand.getOwner(); + return outerParent == owner->getParentOp(); + }); +} + +static fir::SequenceType getAsSequenceType(mlir::Value *v) { + mlir::Type argTy = fir::unwrapPassByRefType(fir::unwrapRefType(v->getType())); + return argTy.dyn_cast(); +} + +void LoopVersioningPass::runOnOperation() { + LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n"); + mlir::func::FuncOp func = getOperation(); + + /// @c ArgInfo + /// A structure to hold an argument, the size of the argument and dimension + /// information. + struct ArgInfo { + mlir::Value *arg; + size_t size; + fir::BoxDimsOp dims[CFI_MAX_RANK]; + }; + + // First look for arguments with assumed shape = unknown extent in the lowest + // dimension. + LLVM_DEBUG(llvm::dbgs() << "Func-name:" << func.getSymName() << "\n"); + mlir::Block::BlockArgListType args = func.getArguments(); + mlir::ModuleOp module = func->getParentOfType(); + fir::KindMapping kindMap = fir::getKindMapping(module); + mlir::SmallVector argsOfInterest; + for (auto &arg : args) { + if (auto seqTy = getAsSequenceType(&arg)) { + unsigned rank = seqTy.getDimension(); + // Currently limited to 1D or 2D arrays as that seems to give good + // improvement without excessive increase in code-size, etc. + if (rank > 0 && rank < 3 && + seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent()) { + size_t typeSize = 0; + mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType()); + if (elementType.isa() || + elementType.isa()) + typeSize = elementType.getIntOrFloatBitWidth() / 8; + else if (auto cty = elementType.dyn_cast()) + typeSize = 2 * cty.getEleType(kindMap).getIntOrFloatBitWidth() / 8; + if (typeSize) + argsOfInterest.push_back({&arg, typeSize, {}}); + else + LLVM_DEBUG(llvm::dbgs() << "Type not supported\n"); + + } else { + LLVM_DEBUG(llvm::dbgs() << "Too many dimensions\n"); + } + } + } + + if (argsOfInterest.empty()) + return; + + struct OpsWithArgs { + mlir::Operation *op; + mlir::SmallVector argsAndDims; + }; + // Now see if those arguments are used inside any loop. + mlir::SmallVector loopsOfInterest; + + func.walk([&](fir::DoLoopOp loop) { + mlir::Block &body = *loop.getBody(); + mlir::SmallVector argsInLoop; + body.walk([&](fir::CoordinateOp op) { + // The current operation could be inside another loop than + // the one we're currently processing. Skip it, we'll get + // to it later. + if (op->getParentOfType() != loop) + return; + const mlir::Value &operand = op->getOperand(0); + for (auto a : argsOfInterest) { + if (*a.arg == operand) { + // Only add if it's not already in the list. + if (std::find_if(argsInLoop.begin(), argsInLoop.end(), [&](auto it) { + return it.arg == a.arg; + }) == argsInLoop.end()) { + + argsInLoop.push_back(a); + break; + } + } + } + }); + + if (!argsInLoop.empty()) { + OpsWithArgs ops = {loop, argsInLoop}; + loopsOfInterest.push_back(ops); + } + }); + if (loopsOfInterest.empty()) + return; + + // If we get here, there are loops to process. + fir::FirOpBuilder builder{module, kindMap}; + mlir::Location loc = builder.getUnknownLoc(); + mlir::IndexType idxTy = builder.getIndexType(); + + LLVM_DEBUG(llvm::dbgs() << "Module Before transformation:"); + LLVM_DEBUG(module->dump()); + + LLVM_DEBUG(llvm::dbgs() << "loopsOfInterest: " << loopsOfInterest.size() + << "\n"); + for (auto op : loopsOfInterest) { + LLVM_DEBUG(op.op->dump()); + builder.setInsertionPoint(op.op); + + mlir::Value allCompares = nullptr; + // Ensure all of the arrays are unit-stride. + for (auto &arg : op.argsAndDims) { + + fir::SequenceType seqTy = getAsSequenceType(arg.arg); + unsigned rank = seqTy.getDimension(); + + // We only care about lowest order dimension. + for (unsigned i = 0; i < rank; i++) { + mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i); + arg.dims[i] = builder.create(loc, idxTy, idxTy, idxTy, + *arg.arg, dimIdx); + } + mlir::Value elemSize = + builder.createIntegerConstant(loc, idxTy, arg.size); + mlir::Value cmp = builder.create( + loc, mlir::arith::CmpIPredicate::eq, arg.dims[0].getResult(2), + elemSize); + if (!allCompares) { + allCompares = cmp; + } else { + allCompares = + builder.create(loc, cmp, allCompares); + } + } + + auto ifOp = + builder.create(loc, op.op->getResultTypes(), allCompares, + /*withElse=*/true); + builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); + + LLVM_DEBUG(llvm::dbgs() << "Creating cloned loop\n"); + mlir::Operation *clonedLoop = op.op->clone(); + bool changed = false; + for (auto &arg : op.argsAndDims) { + fir::SequenceType::Shape newShape; + newShape.push_back(fir::SequenceType::getUnknownExtent()); + auto elementType = fir::unwrapSeqOrBoxedSeqType(arg.arg->getType()); + mlir::Type arrTy = fir::SequenceType::get(newShape, elementType); + mlir::Type boxArrTy = fir::BoxType::get(arrTy); + mlir::Type refArrTy = builder.getRefType(arrTy); + auto carg = builder.create(loc, boxArrTy, *arg.arg); + auto caddr = builder.create(loc, refArrTy, carg); + auto insPt = builder.saveInsertionPoint(); + // Use caddr instead of arg. + clonedLoop->walk([&](fir::CoordinateOp coop) { + // Reduce the multi-dimensioned index to a single index. + // This is required becase fir arrays do not support multiple dimensions + // with unknown dimensions at compile time. + if (coop->getOperand(0) == *arg.arg && + coop->getOperands().size() >= 2) { + builder.setInsertionPoint(coop); + mlir::Value totalIndex = builder.createIntegerConstant(loc, idxTy, 0); + // Operand(1) = array; Operand(2) = index1; Operand(3) = index2 + for (unsigned i = coop->getOperands().size() - 1; i > 1; i--) { + mlir::Value curIndex = + builder.createConvert(loc, idxTy, coop->getOperand(i)); + // First arg is Operand2, so dims[i-2] is 0-based i-1! + mlir::Value scale = + builder.createConvert(loc, idxTy, arg.dims[i - 2].getResult(1)); + totalIndex = builder.create( + loc, totalIndex, + builder.create(loc, scale, curIndex)); + } + totalIndex = builder.create( + loc, totalIndex, + builder.createConvert(loc, idxTy, coop->getOperand(1))); + + auto newOp = builder.create( + loc, builder.getRefType(elementType), caddr, + mlir::ValueRange{totalIndex}); + LLVM_DEBUG(newOp->dump()); + coop->getResult(0).replaceAllUsesWith(newOp->getResult(0)); + coop->erase(); + changed = true; + } + }); + + builder.restoreInsertionPoint(insPt); + } + assert(changed && "Expected operations to have changed"); + + builder.insert(clonedLoop); + // Forward the result(s), if any, from the loop operation to the + // + mlir::ResultRange results = clonedLoop->getResults(); + bool hasResults = (results.size() > 0); + if (hasResults) + builder.create(loc, results); + + // Add the original loop in the else-side of the if operation. + builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); + replaceOuterUses(op.op, ifOp); + op.op->remove(); + builder.insert(op.op); + // Rely on "cloned loop has results, so original loop also has results". + if (hasResults) { + builder.create(loc, op.op->getResults()); + } else { + // Use an assert to check this. + assert(op.op->getResults().size() == 0 && + "Weird, the cloned loop doesn't have results, but the original " + "does?"); + } + } + + LLVM_DEBUG(llvm::dbgs() << "After transform:\n"); + LLVM_DEBUG(module->dump()); + + LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n"); +} + +std::unique_ptr fir::createLoopVersioningPass() { + return std::make_unique(); +} diff --git a/flang/test/Transforms/loop-versioning.fir b/flang/test/Transforms/loop-versioning.fir new file mode 100644 --- /dev/null +++ b/flang/test/Transforms/loop-versioning.fir @@ -0,0 +1,115 @@ +// RUN: fir-opt --loop-versioning %s | FileCheck %s + + +// subroutine sum1d(a, n) +// real*8 :: a(:) +// integer :: n +// real*8 :: sum +// integer :: i +// sum = 0 +// do i=1,n +// sum = sum + a(i) +// end do +// end subroutine sum1d +module { + func.func @sum1d(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "n"}) { + %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMmoduleFsum1dEi"} + %1 = fir.alloca f64 {bindc_name = "sum", uniq_name = "_QMmoduleFsum1dEsum"} + %cst = arith.constant 0.000000e+00 : f64 + fir.store %cst to %1 : !fir.ref + %c1_i32 = arith.constant 1 : i32 + %2 = fir.convert %c1_i32 : (i32) -> index + %3 = fir.load %arg1 : !fir.ref + %4 = fir.convert %3 : (i32) -> index + %c1 = arith.constant 1 : index + %5 = fir.convert %2 : (index) -> i32 + %6:2 = fir.do_loop %arg2 = %2 to %4 step %c1 iter_args(%arg3 = %5) -> (index, i32) { + fir.store %arg3 to %0 : !fir.ref + %7 = fir.load %1 : !fir.ref + %8 = fir.load %0 : !fir.ref + %9 = fir.convert %8 : (i32) -> i64 + %c1_i64 = arith.constant 1 : i64 + %10 = arith.subi %9, %c1_i64 : i64 + %11 = fir.coordinate_of %arg0, %10 : (!fir.box>, i64) -> !fir.ref + %12 = fir.load %11 : !fir.ref + %13 = arith.addf %7, %12 fastmath : f64 + fir.store %13 to %1 : !fir.ref + %14 = arith.addi %arg2, %c1 : index + %15 = fir.convert %c1 : (index) -> i32 + %16 = fir.load %0 : !fir.ref + %17 = arith.addi %16, %15 : i32 + fir.result %14, %17 : index, i32 + } + fir.store %6#1 to %0 : !fir.ref + return + } + +// Note this only checks the expected transformation, not the entire generated code: +// CHECK-LABEL: func.func @sum1d( +// CHECK-SAME: %[[ARG0:.*]]: !fir.box> {{.*}}) +// CHECK: %[[ZERO:.*]] = arith.constant 0 : index +// CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}} +// CHECK: %[[SIZE:.*]] = arith.constant 8 : index +// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS]]#2, %[[SIZE]] +// CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}} +// CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]] +// CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref> +// CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}} +// CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %{{.*}} : (!fir.ref>, index) -> !fir.ref +// CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref +// CHECK: fir.result %{{.*}}, %{{.*}} +// CHECK: } +// CHECK fir.result %[[LOOP_RES]]#0, %[[LOOP_RES]]#1 +// CHECK: } else { +// CHECK: %[[LOOP_RES2:.*]]:2 = fir.do_loop {{.*}} +// CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG0]], %{{.*}} : (!fir.box>, i64) -> !fir.ref +// CHECK: %{{.*}}= fir.load %[[COORD2]] : !fir.ref +// CHECK: fir.result %{{.*}}, %{{.*}} +// CHECK: } +// CHECK fir.result %[[LOOP_RES2]]#0, %[[LOOP_RES2]]#1 +// CHECK: } +// CHECK: fir.store %[[IF_RES]]#1 to %{{.*}} +// CHECK: return + +// ----- + +// Test that loop-versioning pass doesn't expand known size arrays. +func.func @sum1dfixed(%arg0: !fir.ref> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "n"}) { + %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsum1dfixedEi"} + %1 = fir.alloca f64 {bindc_name = "sum", uniq_name = "_QFsum1dfixedEsum"} + %cst = arith.constant 0.000000e+00 : f64 + fir.store %cst to %1 : !fir.ref + %c1_i32 = arith.constant 1 : i32 + %2 = fir.convert %c1_i32 : (i32) -> index + %3 = fir.load %arg1 : !fir.ref + %4 = fir.convert %3 : (i32) -> index + %c1 = arith.constant 1 : index + %5 = fir.convert %2 : (index) -> i32 + %6:2 = fir.do_loop %arg2 = %2 to %4 step %c1 iter_args(%arg3 = %5) -> (index, i32) { + fir.store %arg3 to %0 : !fir.ref + %7 = fir.load %1 : !fir.ref + %8 = fir.load %0 : !fir.ref + %9 = fir.convert %8 : (i32) -> i64 + %c1_i64 = arith.constant 1 : i64 + %10 = arith.subi %9, %c1_i64 : i64 + %11 = fir.coordinate_of %arg0, %10 : (!fir.ref>, i64) -> !fir.ref + %12 = fir.load %11 : !fir.ref + %13 = arith.addf %7, %12 fastmath : f64 + fir.store %13 to %1 : !fir.ref + %14 = arith.addi %arg2, %c1 : index + %15 = fir.convert %c1 : (index) -> i32 + %16 = fir.load %0 : !fir.ref + %17 = arith.addi %16, %15 : i32 + fir.result %14, %17 : index, i32 + } + fir.store %6#1 to %0 : !fir.ref + return + } + +// CHECK-LABEL: func.func @sum1dfixed( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref> {{.*}}) +// CHECK: fir.do_loop {{.*}} +// CHECK: %[[COORD:.*]] = fir.coordinate_of %[[ARG0]], {{.*}} +// CHECK: %{{.*}} = fir.load %[[COORD]] + +} // End module