Index: lib/CodeGen/ShrinkWrap.cpp =================================================================== --- lib/CodeGen/ShrinkWrap.cpp +++ lib/CodeGen/ShrinkWrap.cpp @@ -90,11 +90,18 @@ STATISTIC(NumCandidates, "Number of shrink-wrapping candidates"); STATISTIC(NumCandidatesDropped, "Number of shrink-wrapping candidates dropped because of frequency"); +STATISTIC(NumPostShrink, + "Number of post shrink-wrapping with splitting restore point"); static cl::opt EnableShrinkWrapOpt("enable-shrink-wrap", cl::Hidden, cl::desc("enable the shrink-wrapping pass")); +static cl::opt + EnablePostShrinkWrapOpt("enable-post-shrink-wrap", cl::init(true), + cl::Hidden, + cl::desc("enable the post shrink-wrapping")); + namespace { /// \brief Class to determine where the safe point to insert the @@ -175,6 +182,22 @@ /// this call. void updateSaveRestorePoints(MachineBasicBlock &MBB, RegScavenger *RS); + /// Try to find safe point based on dominance and block frequency without + /// any chance in IR. + bool performShrinkWrapping(MachineFunction &MF, RegScavenger *RS); + + /// Try to shrink further by splitting restore point. + bool postShrinkWrapping(bool HasCandidate, MachineFunction &MF, + RegScavenger *RS); + + /// Return ture if current restore point is splittable. + bool checkIfRestoreSplittable( + MachineBasicBlock *CurRestore, + DenseSet &ReachableByDirty, + SmallVector &DirtyPreds, + SmallVector &CleanPreds, + const TargetInstrInfo *TII, RegScavenger *RS); + /// \brief Initialize the pass for \p MF. void init(MachineFunction &MF) { RCI.runOnMachineFunction(MF); @@ -279,18 +302,243 @@ /// \brief Helper function to find the immediate (post) dominator. template static MachineBasicBlock *FindIDom(MachineBasicBlock &Block, ListOfBBs BBs, - DominanceAnalysis &Dom) { + DominanceAnalysis &Dom, bool Strict = true) { MachineBasicBlock *IDom = &Block; for (MachineBasicBlock *BB : BBs) { IDom = Dom.findNearestCommonDominator(IDom, BB); if (!IDom) break; } - if (IDom == &Block) + if (Strict && IDom == &Block) return nullptr; return IDom; } +static bool isAnalyzableBB(const TargetInstrInfo &TII, + MachineBasicBlock &Entry) { + // Check if the block is analyzable. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector Cond; + if (TII.analyzeBranch(Entry, TBB, FBB, Cond)) + return false; + return true; +} + +static bool hasDirtyPred(DenseSet &ReachableByDirty, + MachineBasicBlock &MBB) { + for (auto PredBB : MBB.predecessors()) + if (ReachableByDirty.count(PredBB)) + return true; + return false; +} + +static void markAllReachable(DenseSet &Visited, + const MachineBasicBlock &MBB) { + SmallVector Worklist(MBB.succ_begin(), + MBB.succ_end()); + Visited.insert(&MBB); + while (!Worklist.empty()) { + MachineBasicBlock *SuccMBB = Worklist.pop_back_val(); + if (!Visited.insert(SuccMBB).second) + continue; + Worklist.append(SuccMBB->succ_begin(), SuccMBB->succ_end()); + } +} + +/// Collect blocks reachable by use or def of CSRs/FI. +static void collectBlocksReachableByDirty( + MachineFunction &MF, DenseSet &DirtyBBs, + DenseSet &ReachableByDirty) { + for (const MachineBasicBlock *MBB : DirtyBBs) { + if (ReachableByDirty.count(MBB)) + continue; + // Mark all offsprings as reachable. + markAllReachable(ReachableByDirty, *MBB); + } +} + +/// Return true if there is a clean path from SavePoint to the original Restore. +static bool +isSaveReachableThroughClean(const MachineBasicBlock *SavePoint, + SmallVector CleanPreds) { + DenseSet Visited; + SmallVector Worklist(CleanPreds.begin(), + CleanPreds.end()); + while (!Worklist.empty()) { + MachineBasicBlock *CleanBB = Worklist.pop_back_val(); + if (CleanBB == SavePoint) + return true; + if (!Visited.insert(CleanBB).second || !CleanBB->pred_size()) + continue; + Worklist.append(CleanBB->pred_begin(), CleanBB->pred_end()); + } + return false; +} + +/// Try to split a restore point if doing so can isolate the restore point +/// only for blocks reachable by use or def of CSRs/FI. +static MachineBasicBlock * +tryToSplitRestore(MachineBasicBlock *MBB, + DenseSet &ReachableByDirty, + SmallVector DirtyPreds, + SmallVector CleanPreds, + const TargetInstrInfo *TII) { + MachineFunction *MF = MBB->getParent(); + MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock(); + MF->insert(MachineFunction::iterator(MBB), NMBB); + + for (const auto LI : MBB->liveins()) + NMBB->addLiveIn(LI.PhysReg); + + TII->insertUnconditionalBranch(*NMBB, MBB, DebugLoc()); + + for (auto SuccBB : DirtyPreds) { + SuccBB->ReplaceUsesOfBlockWith(MBB, NMBB); + SuccBB->updateTerminator(); + } + NMBB->addSuccessor(MBB); + + for (auto SuccBB : CleanPreds) + SuccBB->updateTerminator(); + + return NMBB; +} + +static void +rollbackRestoreSplit(MachineFunction &MF, MachineBasicBlock *NMBB, + MachineBasicBlock *MBB, + SmallVector DirtyPreds, + SmallVector CleanPreds) { + NMBB->removeSuccessor(MBB); + for (auto SuccBB : DirtyPreds) { + SuccBB->ReplaceUsesOfBlockWith(NMBB, MBB); + SuccBB->updateTerminator(); + } + + for (auto SuccBB : CleanPreds) + SuccBB->updateTerminator(); + + NMBB->erase(NMBB->begin(), NMBB->end()); + NMBB->eraseFromParent(); +} + +bool ShrinkWrap::checkIfRestoreSplittable( + MachineBasicBlock *CurRestore, + DenseSet &ReachableByDirty, + SmallVector &DirtyPreds, + SmallVector &CleanPreds, const TargetInstrInfo *TII, + RegScavenger *RS) { + for (const MachineInstr &MI : *CurRestore) + if (useOrDefCSROrFI(MI, RS)) + return false; + + for (auto PredBB : CurRestore->predecessors()) { + if (!isAnalyzableBB(*TII, *PredBB)) + return false; + + if (ReachableByDirty.count(PredBB)) + DirtyPreds.push_back(PredBB); + else + CleanPreds.push_back(PredBB); + } + + return !(CleanPreds.empty() || DirtyPreds.empty()); +} + +bool ShrinkWrap::postShrinkWrapping(bool HasCandidate, MachineFunction &MF, + RegScavenger *RS) { + if (!EnablePostShrinkWrapOpt) + return false; + + MachineBasicBlock *InitSave = nullptr; + MachineBasicBlock *InitRestore = nullptr; + + if (HasCandidate) { + InitSave = Save; + InitRestore = Restore; + } else { + InitRestore = nullptr; + InitSave = &MF.front(); + for (MachineBasicBlock &MBB : MF) { + if (MBB.isEHFuncletEntry()) + return false; + if (MBB.isReturnBlock()) { + // Do not support multiple restore points. + if (InitRestore) + return false; + InitRestore = &MBB; + } + } + } + + if (!InitSave || !InitRestore || InitRestore == InitSave || + !MDT->dominates(InitSave, InitRestore) || + !MPDT->dominates(InitRestore, InitSave)) + return false; + + DenseSet DirtyBBs; + for (auto &MBB : MF) + for (const MachineInstr &MI : MBB) + if (useOrDefCSROrFI(MI, RS)) { + DirtyBBs.insert(&MBB); + break; + } + + // Find blocks reachable from the use of def of CSRs/FI. + DenseSet ReachableByDirty; + collectBlocksReachableByDirty(MF, DirtyBBs, ReachableByDirty); + + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + SmallVector DirtyPreds; + SmallVector CleanPreds; + if (!checkIfRestoreSplittable(InitRestore, ReachableByDirty, DirtyPreds, + CleanPreds, TII, RS)) + return false; + + // Trying to reach out to the new save point which dominate all dirty blocks. + MachineBasicBlock *NewSave = + FindIDom<>(**DirtyPreds.begin(), DirtyPreds, *MDT, false); + + while (NewSave && (hasDirtyPred(ReachableByDirty, *NewSave) || + EntryFreq < MBFI->getBlockFreq(NewSave).getFrequency())) + NewSave = FindIDom<>(**NewSave->pred_begin(), NewSave->predecessors(), *MDT, + false); + + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + if (!NewSave || NewSave == InitSave || + isSaveReachableThroughClean(NewSave, CleanPreds) || + !TFI->canUseAsPrologue(*NewSave)) + return false; + + /// Now we know that splitting a restore point can isolate the restore point + /// only for dirty blocks. + MachineBasicBlock *NewRestore = tryToSplitRestore( + InitRestore, ReachableByDirty, DirtyPreds, CleanPreds, TII); + + /// Make sure if the new restore point is valid as an epilogue, depending on + /// targets. + if (!TFI->canUseAsEpilogue(*NewRestore)) { + rollbackRestoreSplit(MF, NewRestore, InitRestore, DirtyPreds, CleanPreds); + return false; + } + + Save = NewSave; + Restore = NewRestore; + + MDT->runOnMachineFunction(MF); + MPDT->runOnMachineFunction(MF); + + assert((MDT->dominates(Save, Restore) && MPDT->dominates(Restore, Save)) && + "Incorrect save or restore point due to dominance relations"); + assert((!MLI->getLoopFor(Save) && !MLI->getLoopFor(Restore)) && + "Unexpected save or restore point in a loop"); + assert((EntryFreq >= MBFI->getBlockFreq(Save).getFrequency() && + EntryFreq >= MBFI->getBlockFreq(Restore).getFrequency()) && + "Incorrect save or restore point based on block frequency"); + ++NumPostShrink; + return true; +} + void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB, RegScavenger *RS) { // Get rid of the easy cases first. @@ -448,28 +696,7 @@ return false; } -bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF)) - return false; - - DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); - - init(MF); - - if (isIrreducibleCFG(MF, *MLI)) { - // If MF is irreducible, a block may be in a loop without - // MachineLoopInfo reporting it. I.e., we may use the - // post-dominance property in loops, which lead to incorrect - // results. Moreover, we may miss that the prologue and - // epilogue are not in the same loop, leading to unbalanced - // construction/deconstruction of the stack frame. - DEBUG(dbgs() << "Irreducible CFGs are not supported yet\n"); - return false; - } - - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - std::unique_ptr RS( - TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr); +bool ShrinkWrap::performShrinkWrapping(MachineFunction &MF, RegScavenger *RS) { for (MachineBasicBlock &MBB : MF) { DEBUG(dbgs() << "Look into: " << MBB.getNumber() << ' ' << MBB.getName() @@ -481,11 +708,11 @@ } for (const MachineInstr &MI : MBB) { - if (!useOrDefCSROrFI(MI, RS.get())) + if (!useOrDefCSROrFI(MI, RS)) continue; // Save (resp. restore) point must dominate (resp. post dominate) // MI. Look for the proper basic block for those. - updateSaveRestorePoints(MBB, RS.get()); + updateSaveRestorePoints(MBB, RS); // If we are at a point where we cannot improve the placement of // save/restore instructions, just give up. if (!ArePointsInteresting()) { @@ -537,13 +764,45 @@ break; NewBB = Restore; } - updateSaveRestorePoints(*NewBB, RS.get()); + updateSaveRestorePoints(*NewBB, RS); } while (Save && Restore); if (!ArePointsInteresting()) { ++NumCandidatesDropped; return false; } + return true; +} + +bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF)) + return false; + + DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); + + init(MF); + + if (isIrreducibleCFG(MF, *MLI)) { + // If MF is irreducible, a block may be in a loop without + // MachineLoopInfo reporting it. I.e., we may use the + // post-dominance property in loops, which lead to incorrect + // results. Moreover, we may miss that the prologue and + // epilogue are not in the same loop, leading to unbalanced + // construction/deconstruction of the stack frame. + DEBUG(dbgs() << "Irreducible CFGs are not supported yet\n"); + return false; + } + + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + std::unique_ptr RS( + TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr); + + bool Changed = false; + bool HasCandidate = performShrinkWrapping(MF, RS.get()); + Changed = postShrinkWrapping(HasCandidate, MF, RS.get()); + + if (!ArePointsInteresting()) + return false; DEBUG(dbgs() << "Final shrink wrap candidates:\nSave: " << Save->getNumber() << ' ' << Save->getName() << "\nRestore: " @@ -553,7 +812,7 @@ MFI.setSavePoint(Save); MFI.setRestorePoint(Restore); ++NumCandidates; - return false; + return Changed; } bool ShrinkWrap::isShrinkWrapEnabled(const MachineFunction &MF) { Index: test/CodeGen/AArch64/post-shrink-wrap-split-restore.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/post-shrink-wrap-split-restore.ll @@ -0,0 +1,205 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +; CHECK-LABEL: shrinkwrapme +; CHECK-LABEL: %bb.0: +; CHECK-NOT: str {{.*}} [sp{{.*}}] +; CHECK-LABEL: %bb.1: +; CHECK: str {{.*}} [sp{{.*}}] +define void @shrinkwrapme(i32 %a) { +entry: + %cmp5 = icmp sgt i32 %a, 0 + br i1 %cmp5, label %BB0, label %Exit + +BB0: + %call = call i32 @fun() + %c = icmp eq i32 %call, 0 + br i1 %c, label %BB1, label %Exit + +BB1: + %call2 = call i32 @fun() + br label %Exit + +Exit: + ret void +} + +; CHECK-LABEL: shrinkwrapme2 +; CHECK-LABEL: %bb.0: +; CHECK-NOT: str {{.*}} [sp{{.*}}] +; CHECK-LABEL: %BB03 +; CHECK: str {{.*}} [sp{{.*}}] +define void @shrinkwrapme2(i32 %a, i32* %P1, i32* %P2) { +BB00: + %cmp5 = icmp sgt i32 %a, 0 + br i1 %cmp5, label %BB01, label %Exit + +BB01: ; dirty + store i32 %a, i32* %P1 + %c1 = icmp sgt i32 %a, 1 + br i1 %c1, label %BB02, label %BB03 + +BB02: + store i32 %a, i32* %P2 + br label %BB03 + +BB03: + %call03 = call i32 @fun() + %c03 = icmp eq i32 %call03, 0 + br i1 %c03, label %BB04, label %BB05 + +BB04: + %call04 = call i32 @fun() + br label %BB05 + +BB05: + %call05 = call i32 @fun() + %c05 = icmp eq i32 %call05, 0 + br i1 %c05, label %BB06, label %BB07 + +BB06: + %call06 = call i32 @fun() + br label %Exit + +BB07: + %call07 = call i32 @fun2() + br label %Exit + + +Exit: + ret void +} + +; CHECK-LABEL: shrinkwrapme3 +; CHECK-LABEL: %bb.0: +; CHECK-NOT: str {{.*}} [sp{{.*}}] +; CHECK-LABEL: %Header.preheader +; CHECK: st{{p|r}} {{.*}} [sp{{.*}}] +define void @shrinkwrapme3(i32 %a, i32** %PP) { +BB00: + %cmp5 = icmp sgt i32 %a, 0 + %P = load i32*, i32** %PP + br i1 %cmp5, label %Header, label %Exit + +Header: + %l = load i32, i32* %P + %cl = icmp eq i32 %l, 0 + br i1 %cl, label %BB01, label %Exit + +BB01: + %call = call i32 @fun() + %c = icmp eq i32 %call, 0 + br i1 %c, label %BB02, label %Exit + +BB02: + %call2 = call i32 @fun() + %c2 = icmp eq i32 %call2, 0 + br i1 %c2, label %Exit, label %Header + +Exit: + ret void +} + + +; BB1 cannot be a new save point because it has a clean path to Exit. +; CHECK-LABEL: donotshrinkwrapme +; CHECK-LABEL: %bb.0: +; CHECK: str {{.*}} [sp{{.*}}] +define void @donotshrinkwrapme(i32 %a, i32 %v, i32 %v2) { +entry: + %cmp5 = icmp sgt i32 %a, 0 + br i1 %cmp5, label %BB0, label %Exit + +BB0: + %c = icmp eq i32 %a, 10 + br i1 %c, label %BB1, label %BB2 + +BB1: + %c1 = icmp eq i32 %v, 10 + br i1 %c1, label %BB3, label %BB2 + +BB2: + %c2 = icmp eq i32 %v2, 10 + br i1 %c2, label %BB4, label %Exit + +BB3: + %call3 = call i32 @fun() + %c3 = icmp eq i32 %call3, 0 + br label %Exit + +BB4: + %call4 = call i32 @fun2() + br label %Exit + +Exit: + ret void +} + +; No further shrinking when calling in an infinite loop. +; CHECK-LABEL: donotshrinkwrapme2 +; CHECK-LABEL: %bb.0: +; CHECK: st{{r|p}} {{.*}} [sp{{.*}}] +define void @donotshrinkwrapme2(i32 %a) { +BB00: + %cmp5 = icmp sgt i32 %a, 0 + br i1 %cmp5, label %BB01, label %InfLoop + +BB01: + %call = call i32 @fun() + %c = icmp eq i32 %call, 0 + br i1 %c, label %BB02, label %Exit + +BB02: + %call2 = call i32 @fun() + br label %Exit + +InfLoop: + %call3 = call i32 @fun() + br label %InfLoop + +Exit: + ret void +} + +; No further shrinking if save cannot dominate restore. +; CHECK-LABEL: donotshrinkwrapme3 +; CHECK-LABEL: %bb.0: +; CHECK-NOT: str {{.*}} [sp{{.*}}] + +define void @donotshrinkwrapme3(i32 %a) { +BB00: + %cmp5 = icmp sgt i32 %a, 0 + br i1 %cmp5, label %BB02, label %BB01 + +BB01: ; dirty + %call01 = call i32 @fun() + %c01 = icmp eq i32 %call01, 0 + br i1 %c01, label %BB01.1, label %Exit + +BB01.1: + call void @abort() noreturn + unreachable + +BB02: + %call02 = call i32 @fun() + %c02 = icmp eq i32 %call02, 0 + br i1 %c02, label %BB03, label %BB04 + +BB03: + %call03 = call i32 @fun() + %c03 = icmp eq i32 %call03, 0 + br i1 %c03, label %BB04, label %Exit + +BB04: + %call04 = call i32 @fun() + br label %Exit + +Exit: + ret void +} + +declare i32 @fun() +declare i32 @fun2() +declare void @abort() noreturn