Index: llvm/lib/Transforms/Scalar/LoopFuse.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -68,6 +68,12 @@ #include "llvm/Transforms/Utils/CodeMoverUtils.h" #include "llvm/Transforms/Utils/LoopPeel.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + using namespace llvm; #define DEBUG_TYPE "loop-fusion" @@ -101,6 +107,8 @@ STATISTIC(NotRotated, "Candidate is not rotated"); STATISTIC(OnlySecondCandidateIsGuarded, "The second candidate is guarded while the first one is not"); +STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); +STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -549,16 +557,16 @@ PostDominatorTree &PDT; OptimizationRemarkEmitter &ORE; AssumptionCache &AC; - const TargetTransformInfo &TTI; + AAResults &AA; public: LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, ScalarEvolution &SE, PostDominatorTree &PDT, OptimizationRemarkEmitter &ORE, const DataLayout &DL, - AssumptionCache &AC, const TargetTransformInfo &TTI) + AssumptionCache &AC, const TargetTransformInfo &TTI, AAResults &AA) : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), - DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} + DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI), AA(AA) {} /// This is the main entry point for loop fusion. It will traverse the /// specified function and collect candidate loops to fuse, starting at the @@ -914,16 +922,6 @@ continue; } - if (!isSafeToMoveBefore(*FC1->Preheader, - *FC0->Preheader->getTerminator(), DT, &PDT, - &DI)) { - LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " - "instructions in preheader. Not fusing.\n"); - reportLoopFusion(*FC0, *FC1, - NonEmptyPreheader); - continue; - } - if (FC0->GuardBranch) { assert(FC1->GuardBranch && "Expecting valid FC1 guard branch"); @@ -959,6 +957,32 @@ continue; } + // If the second loop has instructions in the pre-header, attempt to + // hoist them up to the first loop's pre-header or sink them into the + // body of the second loop. + if (!isEmptyPreheader(*FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " + "preheader.\n"); + // At this point, this is the last remaining legality check. + // Which means if we can make this pre-header empty, we can fuse + // these loops + SmallVector SafeToHoist; + SmallVector SafeToSink; + + // If it is not safe to hoist/sink all instructions in the + // pre-header, we cannot fuse these loops. + if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink)) { + LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in " + "Fusion Candidate Pre-header.\n" + << "Not Fusing.\n"); + reportLoopFusion(*FC0, *FC1, + NonEmptyPreheader); + continue; + } + // Execute the hoist/sink operations on preheader instructions + movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink); + } + bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); LLVM_DEBUG(dbgs() << "\tFusion appears to be " @@ -1022,6 +1046,74 @@ return Fused; } + /// Collect instructions in the \p FC1 Preheader that can be hoisted + /// to the \p FC0 Preheader or sunk into the \p FC1 Body + bool collectMovablePreheaderInsts( + const FusionCandidate &FC0, const FusionCandidate &FC1, + SmallVector &SafeToHoist, + SmallVector &SafeToSink) const { + BasicBlock *FC1Preheader = FC1.Preheader; + for (auto &I : *FC1Preheader) { + // Can't move a branch + if (&I == FC1Preheader->getTerminator()) + continue; + // If the instruction has side-effects, give up. + // TODO: The case of mayReadFromMemory we can handle but requires + // additional work with a dependence analysis so for now we give + // up on memory reads. + if (I.mayWriteToMemory() || I.mayThrow() || I.mayReadFromMemory()) { + LLVM_DEBUG(dbgs() << "Inst: " << I << " has side-effects.\n"); + return false; + } + // Compute alias set needed for canSinkOrHoinstInst + AliasSetTracker CurAST0(AA); + for (BasicBlock *BB : FC0.L->blocks()) + CurAST0.add(*BB); + + AliasSetTracker CurAST1(AA); + for (BasicBlock *BB : FC1.L->blocks()) + CurAST1.add(*BB); + CurAST1.add(*FC1Preheader); + // Default behaviour of this routine as defined in LICM decides + // to not hoist or sink debug intrinsics because it is not useful. + // For us it is useful so we make a special case here, instead of + // changing LICM code which would result in more side-effects. + if (!isa(I) && + !canSinkOrHoistInst(I, &AA, &DT, FC0.L, &CurAST0, nullptr, true) && + !canSinkOrHoistInst(I, &AA, &DT, FC1.L, &CurAST1, nullptr, true)) { + LLVM_DEBUG(dbgs() << "Inst: " << I << " cannot be sunk or hoisted.\n"); + return false; + } + // First check if can be hoisted + // If the operands of this instruction dominate the FC0 Preheader + // target block, then it is safe to move them to the end of the FC0 + BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor(); + assert(FC0PreheaderTarget && + "Expected single successor for loop preheader."); + bool OperandsDomFC0Body = true; + for (auto &Op : I.operands()) + if (Instruction *OpInst = dyn_cast(Op)) + if (!DT.dominates(OpInst, FC0PreheaderTarget)) { + OperandsDomFC0Body = false; + continue; + } + + if (OperandsDomFC0Body) + SafeToHoist.push_back(&I); + else { + // If I is not used in FC1.L, it can be sunk + for (auto U : I.users()) { + if (Instruction *UI = dyn_cast(U)) + if (!DT.dominates(FC1.ExitBlock, UI->getParent())) + return false; // Cannot sink or hoist; might as well stop here + } + // second loop does not contain users of I, safe to sink + SafeToSink.push_back(&I); + } + } + return true; + } + /// Rewrite all additive recurrences in a SCEV to use a new loop. class AddRecLoopReplacer : public SCEVRewriteVisitor { public: @@ -1235,6 +1327,39 @@ return FC0.ExitBlock == FC1.getEntryBlock(); } + bool isEmptyPreheader(const FusionCandidate &FC) const { + return FC.Preheader->size() == 1; + } + + /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader + /// and sink others into the body of \p FC1. + void movePreheaderInsts(const FusionCandidate &FC0, + const FusionCandidate &FC1, + SmallVector &HoistInsts, + SmallVector &SinkInsts) const { + LLVM_DEBUG(if (VerboseFusionDebugging) { + if (!HoistInsts.empty()) + dbgs() << "Hoisting: \n"; + for (auto I : HoistInsts) + dbgs() << *I << "\n"; + if (!SinkInsts.empty()) + dbgs() << "Sinking: \n"; + for (auto I : SinkInsts) + dbgs() << *I << "\n"; + }); + for (auto I : HoistInsts) { + assert(I->getParent() == FC1.Preheader); + NumHoistedInsts++; + I->moveBefore(FC0.Preheader->getTerminator()); + } + // insert instructions in reverse order to maintain dominance relationship + for (auto I : reverse(SinkInsts)) { + assert(I->getParent() == FC1.Preheader); + NumSunkInsts++; + I->moveBefore(&*FC1.ExitBlock->getFirstInsertionPt()); + } + } + /// Determine if two fusion candidates have identical guards /// /// This method will determine if two fusion candidates have the same guards. @@ -1820,6 +1945,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequiredID(LoopSimplifyID); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); @@ -1838,6 +1964,7 @@ bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; + auto &LI = getAnalysis().getLoopInfo(); auto &DT = getAnalysis().getDomTree(); auto &DI = getAnalysis().getDI(); @@ -1847,9 +1974,10 @@ auto &AC = getAnalysis().getAssumptionCache(F); const TargetTransformInfo &TTI = getAnalysis().getTTI(F); + auto &AA = getAnalysis().getAAResults(); const DataLayout &DL = F.getParent()->getDataLayout(); - LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); + LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI, AA); return LF.fuseLoops(F); } }; @@ -1864,9 +1992,10 @@ auto &ORE = AM.getResult(F); auto &AC = AM.getResult(F); const TargetTransformInfo &TTI = AM.getResult(F); + auto &AA = AM.getResult(F); const DataLayout &DL = F.getParent()->getDataLayout(); - LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); + LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI, AA); bool Changed = LF.fuseLoops(F); if (!Changed) return PreservedAnalyses::all(); Index: llvm/test/Transforms/LoopFusion/hoist_preheader.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/hoist_preheader.ll @@ -0,0 +1,31 @@ +; RUN: opt -S -loop-fusion < %s | FileCheck %s + +define void @hoist_preheader(i32 %N) { + +; CHECK:pre1: +; CHECK-NEXT: %hoistme = add i32 1, %N +; CHECK-NEXT: br label %body1 +pre1: + br label %body1 + +; CHECK: body1: +; CHECK-NOT: %hoistme = add i32 1, %N +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + br i1 %cond, label %body1, label %pre2 + +pre2: + %hoistme = add i32 1, %N + br label %body2 + +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + br i1 %cond2, label %body2, label %exit + +exit: + ret void +} Index: llvm/test/Transforms/LoopFusion/no_sink_hoist.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/no_sink_hoist.ll @@ -0,0 +1,40 @@ +;;;; RUN: opt -S -loop-fusion < %s | FileCheck %s +; RUN: opt -S -loop-fusion -debug -o %t1 < %s &> %t2 +; RUN: cat %t1 | FileCheck %s +; RUN: grep "Could not hoist/sink all instructions" %t2 + +define void @sink_preheader(i32 %N) { +; CHECK:pre1: +; CHECK-NEXT: br label %body1 +pre1: + br label %body1 + +; CHECK:body1: +; CHECK-NOT: %stay = +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + %barrier = add i32 1, %N + br i1 %cond, label %body1, label %pre2 + +; CHECK:pre2: +; CHECK: %stay = +pre2: + %stay = add i32 1, %barrier + br label %body2 + +; CHECK: body2: +; CHECK-NOT: %stay = +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + %barrier2 = add i32 1, %stay + br i1 %cond2, label %body2, label %exit + +; CHECK: exit: +; CHECK-NOT: %stay = +exit: + ret void +} Index: llvm/test/Transforms/LoopFusion/simple.ll =================================================================== --- llvm/test/Transforms/LoopFusion/simple.ll +++ llvm/test/Transforms/LoopFusion/simple.ll @@ -487,8 +487,9 @@ } ; Test that `%add` cannot be moved to basic block entry, as it uses %i, which -; defined after basic block entry. And the two loops for.first and for.second -; are not fused. +; defined after basic block entry. It also cannot be moved to for.second.exit +; since it is used in for.second. Check also that the two loops for.first and +; for.second are not fused. define i64 @unsafe_preheader(i32* %A, i64 %x) { ; CHECK-LABEL: @unsafe_preheader( @@ -505,7 +506,7 @@ ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[X:%.*]], [[I]] ; CHECK-NEXT: br label [[FOR_SECOND:%.*]] ; CHECK: for.second: -; CHECK-NEXT: [[J:%.*]] = phi i64 [ 0, [[FOR_FIRST_EXIT]] ], [ [[INC_J:%.*]], [[FOR_SECOND]] ] +; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[ADD]], [[FOR_FIRST_EXIT]] ], [ [[INC_J:%.*]], [[FOR_SECOND]] ] ; CHECK-NEXT: [[AJ:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[J]] ; CHECK-NEXT: store i32 2, i32* [[AJ]], align 4 ; CHECK-NEXT: [[INC_J]] = add nsw i64 [[J]], 1 @@ -530,7 +531,7 @@ br label %for.second for.second: - %j = phi i64 [ 0, %for.first.exit ], [ %inc.j, %for.second ] + %j = phi i64 [ %add, %for.first.exit ], [ %inc.j, %for.second ] %Aj = getelementptr inbounds i32, i32* %A, i64 %j store i32 2, i32* %Aj, align 4 %inc.j = add nsw i64 %j, 1 Index: llvm/test/Transforms/LoopFusion/sink_preheader.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/sink_preheader.ll @@ -0,0 +1,38 @@ +; RUN: opt -S -loop-fusion < %s | FileCheck %s + +define void @sink_preheader(i32 %N) { + +; CHECK:pre1: +; CHECK-NEXT: br label %body1 +pre1: + br label %body1 + +; CHECK:body1: +; CHECK-NOT: %sinkme +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + %barrier = add i32 1, %N + br i1 %cond, label %body1, label %pre2 + +pre2: + %sinkme = add i32 1, %barrier + %sinkme2 = add i32 1, %barrier + br label %body2 + +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + br i1 %cond2, label %body2, label %exit + +; CHECK:exit: +; CHECK-NEXT: %sinkme = add i32 1, %barrier +; CHECK-NEXT: %sinkme2 = add i32 1, %barrier +; CHECK-NEXT: ret void +exit: + ret void + + +}