Index: llvm/lib/Transforms/Scalar/LoopFuse.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -101,6 +101,8 @@ STATISTIC(NotRotated, "Candidate is not rotated"); STATISTIC(OnlySecondCandidateIsGuarded, "The second candidate is guarded while the first one is not"); +STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); +STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -183,9 +185,8 @@ OptimizationRemarkEmitter &ORE; - FusionCandidate(Loop *L, DominatorTree &DT, - const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE, - TTI::PeelingPreferences PP) + FusionCandidate(Loop *L, DominatorTree &DT, const PostDominatorTree *PDT, + OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), Latch(L->getLoopLatch()), L(L), Valid(true), @@ -549,7 +550,6 @@ PostDominatorTree &PDT; OptimizationRemarkEmitter &ORE; AssumptionCache &AC; - const TargetTransformInfo &TTI; public: @@ -807,7 +807,7 @@ } // Cannot modify the predecessors inside the above loop as it will cause // the iterators to be nullptrs, causing memory errors. - for (Instruction *CurrentBranch: WorkList) { + for (Instruction *CurrentBranch : WorkList) { BasicBlock *Succ = CurrentBranch->getSuccessor(0); if (Succ == BB) Succ = CurrentBranch->getSuccessor(1); @@ -914,16 +914,6 @@ continue; } - if (!isSafeToMoveBefore(*FC1->Preheader, - *FC0->Preheader->getTerminator(), DT, &PDT, - &DI)) { - LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " - "instructions in preheader. Not fusing.\n"); - reportLoopFusion(*FC0, *FC1, - NonEmptyPreheader); - continue; - } - if (FC0->GuardBranch) { assert(FC1->GuardBranch && "Expecting valid FC1 guard branch"); @@ -959,6 +949,31 @@ continue; } + // If the second loop has instructions in the pre-header, attempt to + // hoist them up to the first loop's pre-header or sink them into the + // body of the second loop. + SmallVector SafeToHoist; + SmallVector SafeToSink; + // At this point, this is the last remaining legality check. + // Which means if we can make this pre-header empty, we can fuse + // these loops + if (!isEmptyPreheader(*FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " + "preheader.\n"); + + // If it is not safe to hoist/sink all instructions in the + // pre-header, we cannot fuse these loops. + if (!collectMovablePreheaderInsts(*FC0, *FC1, SafeToHoist, + SafeToSink)) { + LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in " + "Fusion Candidate Pre-header.\n" + << "Not Fusing.\n"); + reportLoopFusion(*FC0, *FC1, + NonEmptyPreheader); + continue; + } + } + bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); LLVM_DEBUG(dbgs() << "\tFusion appears to be " @@ -972,6 +987,9 @@ // and profitable. At this point, start transforming the code and // perform fusion. + // Execute the hoist/sink operations on preheader instructions + movePreheaderInsts(*FC0, *FC1, SafeToHoist, SafeToSink); + LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " << *FC1 << "\n"); @@ -1022,6 +1040,74 @@ return Fused; } + /// Collect instructions in the \p FC1 Preheader that can be hoisted + /// to the \p FC0 Preheader or sunk into the \p FC1 Body + bool collectMovablePreheaderInsts( + const FusionCandidate &FC0, const FusionCandidate &FC1, + SmallVector &SafeToHoist, + SmallVector &SafeToSink) const { + BasicBlock *FC1Preheader = FC1.Preheader; + for (Instruction &I : *FC1Preheader) { + // Can't move a branch + if (&I == FC1Preheader->getTerminator()) + continue; + // If the instruction has side-effects, give up. + // TODO: The case of mayReadFromMemory we can handle but requires + // additional work with a dependence analysis so for now we give + // up on memory reads. + if (I.mayHaveSideEffects() || I.mayReadFromMemory()) { + LLVM_DEBUG(dbgs() << "Inst: " << I << " may have side-effects.\n"); + return false; + } + + LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n"); + + // First check if can be hoisted + // If the operands of this instruction dominate the FC0 Preheader + // target block, then it is safe to move them to the end of the FC0 + const BasicBlock *FC0PreheaderTarget = + FC0.Preheader->getSingleSuccessor(); + assert(FC0PreheaderTarget && + "Expected single successor for loop preheader."); + bool CanHoistInst = true; + for (Use &Op : I.operands()) { + if (auto *OpInst = dyn_cast(Op)) { + bool OpHoisted = is_contained(SafeToHoist, OpInst); + // Check if we have already decided to hoist this operand. In this + // case, it does not dominate FC0 *yet*, but will after we hoist it. + if (!(OpHoisted || DT.dominates(OpInst, FC0PreheaderTarget))) { + CanHoistInst = false; + break; + } + } + } + if (CanHoistInst) { + SafeToHoist.push_back(&I); + LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n"); + } else { + LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n"); + + for (User *U : I.users()) { + if (auto *UI{dyn_cast(U)}) { + // Cannot sink if user in loop + // If FC1 has phi users of this value, we cannot sink it into FC1. + if (FC1.L->contains(UI)) { + // Cannot hoist or sink this instruction. No hoisting/sinking + // should take place, loops should not fuse + LLVM_DEBUG(dbgs() << "\tCould not sink.\n"); + return false; + } + } + } + SafeToSink.push_back(&I); + LLVM_DEBUG(dbgs() << "\tSafe to sink.\n"); + } + } + LLVM_DEBUG( + dbgs() << "All preheader instructions could be sunk or hoisted!\n"); + return true; + } + /// Rewrite all additive recurrences in a SCEV to use a new loop. class AddRecLoopReplacer : public SCEVRewriteVisitor { public: @@ -1235,6 +1321,47 @@ return FC0.ExitBlock == FC1.getEntryBlock(); } + bool isEmptyPreheader(const FusionCandidate &FC) const { + return FC.Preheader->size() == 1; + } + + /// Hoist \p FC1 Preheader instructions to \p FC0 Preheader + /// and sink others into the body of \p FC1. + void movePreheaderInsts(const FusionCandidate &FC0, + const FusionCandidate &FC1, + SmallVector &HoistInsts, + SmallVector &SinkInsts) const { + + // All preheader instructions except the branch must be hoisted or sunk + assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - 1 && + "Attempting to sink and hoist preheader instructions, but not all " + "the preheader instructions are accounted for."); + + NumHoistedInsts += HoistInsts.size(); + NumSunkInsts += SinkInsts.size(); + + LLVM_DEBUG(if (VerboseFusionDebugging) { + if (!HoistInsts.empty()) + dbgs() << "Hoisting: \n"; + for (Instruction *I : HoistInsts) + dbgs() << *I << "\n"; + if (!SinkInsts.empty()) + dbgs() << "Sinking: \n"; + for (Instruction *I : SinkInsts) + dbgs() << *I << "\n"; + }); + + for (Instruction *I : HoistInsts) { + assert(I->getParent() == FC1.Preheader); + I->moveBefore(FC0.Preheader->getTerminator()); + } + // insert instructions in reverse order to maintain dominance relationship + for (Instruction *I : reverse(SinkInsts)) { + assert(I->getParent() == FC1.Preheader); + I->moveBefore(&*FC1.ExitBlock->getFirstInsertionPt()); + } + } + /// Determine if two fusion candidates have identical guards /// /// This method will determine if two fusion candidates have the same guards. @@ -1838,6 +1965,7 @@ bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; + auto &LI = getAnalysis().getLoopInfo(); auto &DT = getAnalysis().getDomTree(); auto &DI = getAnalysis().getDI(); Index: llvm/test/Transforms/LoopFusion/hoist_preheader.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/hoist_preheader.ll @@ -0,0 +1,33 @@ +; RUN: opt -S -loop-fusion < %s | FileCheck %s + +define void @hoist_preheader(i32 %N) { + +; CHECK:pre1: +; CHECK-NEXT: %hoistme = add i32 1, %N +; CHECK-NEXT: %hoistme2 = add i32 1, %hoistme +; CHECK-NEXT: br label %body1 +pre1: + br label %body1 + +; CHECK: body1: +; CHECK-NOT: %hoistme +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + br i1 %cond, label %body1, label %pre2 + +pre2: + %hoistme = add i32 1, %N + %hoistme2 = add i32 1, %hoistme + br label %body2 + +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + br i1 %cond2, label %body2, label %exit + +exit: + ret void +} Index: llvm/test/Transforms/LoopFusion/no_sink_hoist.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/no_sink_hoist.ll @@ -0,0 +1,39 @@ +; RUN: opt -S -loop-simplify -loop-fusion -debug-only=loop-fusion < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; CHECK: Could not hoist/sink all instructions + +define void @sink_preheader(i32 %N) { +; CHECK:pre1: +; CHECK-NEXT: br label %body1 +pre1: + br label %body1 + +; CHECK:body1: +; CHECK-NOT: %stay = +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + %barrier = add i32 1, %N + br i1 %cond, label %body1, label %pre2 + +; CHECK:pre2: +; CHECK: %stay = +pre2: + %stay = add i32 1, %barrier + br label %body2 + +; CHECK: body2: +; CHECK-NOT: %stay = +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + %barrier2 = add i32 1, %stay + br i1 %cond2, label %body2, label %exit + +; CHECK: exit: +; CHECK-NOT: %stay = +exit: + ret void +} Index: llvm/test/Transforms/LoopFusion/no_sink_hoist_inner_barrier.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/no_sink_hoist_inner_barrier.ll @@ -0,0 +1,48 @@ +; RUN: opt -S -loop-fusion -debug-only=loop-fusion < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +; CHECK: Could not hoist/sink all instructions + +define void @sink_preheader(i32 %N) { +; CHECK:pre1: +; CHECK-NEXT: br label %body1 +pre1: + br label %body1 + +; CHECK:body1: +; CHECK-NOT: %no_hoist = +; CHECK-NOT: %no_hoist_sink = +; CHECK-NOT: %no_sink = +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + %barrier = add i32 1, %N + br i1 %cond, label %body1, label %pre2 + +; CHECK:pre2: +; CHECK: %no_hoist = +; CHECK: %no_hoist_sink = +; CHECK: %no_sink = +pre2: + %no_hoist = add i32 1, %barrier + %no_hoist_sink = add i32 1, %no_hoist + %no_sink = add i32 1, %no_hoist_sink + br label %body2 + +; CHECK: body2: +; CHECK-NOT: %no_hoist = +; CHECK-NOT: %no_hoist_sink = +; CHECK-NOT: %no_sink = +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + %barrier2 = add i32 1, %no_sink + br i1 %cond2, label %body2, label %exit + +; CHECK: exit: +; CHECK-NOT: %stay = +exit: + ret void +} Index: llvm/test/Transforms/LoopFusion/no_sink_hoist_store.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/no_sink_hoist_store.ll @@ -0,0 +1,41 @@ +; RUN: opt -S -loop-simplify -loop-fusion -debug-only=loop-fusion < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; CHECK: have side-effects. +; CHECK: Could not hoist/sink all instructions + +@A = common global [100 x i32] zeroinitializer, align 16 +define void @sink_preheader(i32 %N) { +; CHECK:pre1: +; CHECK-NEXT: br label %body1 +pre1: + br label %body1 + +; CHECK:body1: +; CHECK-NOT: %stay = +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + br i1 %cond, label %body1, label %pre2 + +; CHECK:pre2: +; CHECK: %ptr = alloca i32 +; CHECK: store i32 3, i32* %ptr +pre2: + %ptr = alloca i32 + store i32 3, i32* %ptr + br label %body2 + +; CHECK: body2: +; CHECK-NOT: %stay = +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + br i1 %cond2, label %body2, label %exit + +; CHECK: exit: +; CHECK-NOT: %stay = +exit: + ret void +} Index: llvm/test/Transforms/LoopFusion/no_sink_hoist_unknown_function.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/no_sink_hoist_unknown_function.ll @@ -0,0 +1,40 @@ +; RUN: opt -S -loop-simplify -loop-fusion -debug-only=loop-fusion < %s 2>&1 | FileCheck %s +; REQUIRES: asserts +; CHECK: may have side-effects +; CHECK: Could not hoist/sink all instructions + +declare void @unknown_func() + +define void @sink_preheader(i32 %N) { +; CHECK:pre1: +; CHECK-NEXT: br label %body1 +pre1: + br label %body1 + +; CHECK:body1: +; CHECK-NOT: %stay = +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + br i1 %cond, label %body1, label %pre2 + +; CHECK:pre2: +; CHECK-NEXT: call void @unknown_func() +pre2: + call void @unknown_func() + br label %body2 + +; CHECK: body2: +; CHECK-NOT: %stay = +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + br i1 %cond2, label %body2, label %exit + +; CHECK: exit: +; CHECK-NOT: %stay = +exit: + ret void +} Index: llvm/test/Transforms/LoopFusion/simple.ll =================================================================== --- llvm/test/Transforms/LoopFusion/simple.ll +++ llvm/test/Transforms/LoopFusion/simple.ll @@ -487,8 +487,9 @@ } ; Test that `%add` cannot be moved to basic block entry, as it uses %i, which -; defined after basic block entry. And the two loops for.first and for.second -; are not fused. +; defined after basic block entry. It also cannot be moved to for.second.exit +; since it is used in for.second. Check also that the two loops for.first and +; for.second are not fused. define i64 @unsafe_preheader(i32* %A, i64 %x) { ; CHECK-LABEL: @unsafe_preheader( @@ -505,7 +506,7 @@ ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[X:%.*]], [[I]] ; CHECK-NEXT: br label [[FOR_SECOND:%.*]] ; CHECK: for.second: -; CHECK-NEXT: [[J:%.*]] = phi i64 [ 0, [[FOR_FIRST_EXIT]] ], [ [[INC_J:%.*]], [[FOR_SECOND]] ] +; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[ADD]], [[FOR_FIRST_EXIT]] ], [ [[INC_J:%.*]], [[FOR_SECOND]] ] ; CHECK-NEXT: [[AJ:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[J]] ; CHECK-NEXT: store i32 2, i32* [[AJ]], align 4 ; CHECK-NEXT: [[INC_J]] = add nsw i64 [[J]], 1 @@ -530,7 +531,7 @@ br label %for.second for.second: - %j = phi i64 [ 0, %for.first.exit ], [ %inc.j, %for.second ] + %j = phi i64 [ %add, %for.first.exit ], [ %inc.j, %for.second ] %Aj = getelementptr inbounds i32, i32* %A, i64 %j store i32 2, i32* %Aj, align 4 %inc.j = add nsw i64 %j, 1 Index: llvm/test/Transforms/LoopFusion/sink_preheader.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopFusion/sink_preheader.ll @@ -0,0 +1,46 @@ +; RUN: opt -S -loop-fusion < %s | FileCheck %s + +define void @sink_preheader(i32 %N) { +; CHECK-LABEL: @sink_preheader( +; CHECK-NEXT: pre1: +; CHECK-NEXT: br label [[BODY1:%.*]] +; CHECK: body1: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[BODY1]] ], [ 0, [[PRE1:%.*]] ] +; CHECK-NEXT: [[I2:%.*]] = phi i32 [ [[I_NEXT2:%.*]], [[BODY1]] ], [ 0, [[PRE1]] ] +; CHECK-NEXT: [[I_NEXT]] = add i32 1, [[I]] +; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[I]], [[N:%.*]] +; CHECK-NEXT: [[BARRIER:%.*]] = add i32 1, [[N]] +; CHECK-NEXT: [[I_NEXT2]] = add i32 1, [[I2]] +; CHECK-NEXT: [[COND2:%.*]] = icmp ne i32 [[I2]], [[N]] +; CHECK-NEXT: br i1 [[COND2]], label [[BODY1]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[SINKME:%.*]] = add i32 1, [[BARRIER]] +; CHECK-NEXT: [[SINKME2:%.*]] = add i32 1, [[BARRIER]] +; CHECK-NEXT: [[SINKME3:%.*]] = add i32 1, [[SINKME2]] +; CHECK-NEXT: ret void +; +pre1: + br label %body1 + +body1: ; preds = %pre1, %body1 + %i = phi i32 [%i_next, %body1], [0, %pre1] + %i_next = add i32 1, %i + %cond = icmp ne i32 %i, %N + %barrier = add i32 1, %N + br i1 %cond, label %body1, label %pre2 + +pre2: + %sinkme = add i32 1, %barrier + %sinkme2 = add i32 1, %barrier + %sinkme3 = add i32 1, %sinkme2 + br label %body2 + +body2: ; preds = %pre2, %body2 + %i2 = phi i32 [%i_next2, %body2], [0, %pre2] + %i_next2 = add i32 1, %i2 + %cond2 = icmp ne i32 %i2, %N + br i1 %cond2, label %body2, label %exit + +exit: + ret void +}