diff --git a/bolt/include/bolt/Passes/AllocCombiner.h b/bolt/include/bolt/Passes/AllocCombiner.h --- a/bolt/include/bolt/Passes/AllocCombiner.h +++ b/bolt/include/bolt/Passes/AllocCombiner.h @@ -17,6 +17,7 @@ class AllocCombinerPass : public BinaryFunctionPass { /// Stats aggregating variables uint64_t NumCombined{0}; + uint64_t DynamicCountCombined{0}; DenseSet FuncsChanged; void combineAdjustments(BinaryFunction &BF); diff --git a/bolt/include/bolt/Passes/DataflowAnalysis.h b/bolt/include/bolt/Passes/DataflowAnalysis.h --- a/bolt/include/bolt/Passes/DataflowAnalysis.h +++ b/bolt/include/bolt/Passes/DataflowAnalysis.h @@ -315,6 +315,8 @@ void run() { derived().preflight(); + if (Func.begin() == Func.end()) + return; // Initialize state for all points of the function for (BinaryBasicBlock &BB : Func) { StateTy &St = getOrCreateStateAt(BB); @@ -324,7 +326,6 @@ St = derived().getStartingStateAtPoint(Inst); } } - assert(Func.begin() != Func.end() && "Unexpected empty function"); std::queue Worklist; // TODO: Pushing this in a DFS ordering will greatly speed up the dataflow diff --git a/bolt/include/bolt/Passes/FrameAnalysis.h b/bolt/include/bolt/Passes/FrameAnalysis.h --- a/bolt/include/bolt/Passes/FrameAnalysis.h +++ b/bolt/include/bolt/Passes/FrameAnalysis.h @@ -130,7 +130,6 @@ /// Analysis stats counters uint64_t NumFunctionsNotOptimized{0}; uint64_t NumFunctionsFailedRestoreFI{0}; - uint64_t CountFunctionsNotOptimized{0}; uint64_t CountFunctionsFailedRestoreFI{0}; uint64_t CountDenominator{0}; diff --git a/bolt/include/bolt/Passes/FrameOptimizer.h b/bolt/include/bolt/Passes/FrameOptimizer.h --- a/bolt/include/bolt/Passes/FrameOptimizer.h +++ b/bolt/include/bolt/Passes/FrameOptimizer.h @@ -77,9 +77,12 @@ /// Stats aggregating variables uint64_t NumRedundantLoads{0}; uint64_t NumRedundantStores{0}; - uint64_t NumLoadsChangedToReg{0}; - uint64_t NumLoadsChangedToImm{0}; + uint64_t FreqRedundantLoads{0}; + uint64_t FreqRedundantStores{0}; + uint64_t FreqLoadsChangedToReg{0}; + uint64_t FreqLoadsChangedToImm{0}; uint64_t NumLoadsDeleted{0}; + uint64_t FreqLoadsDeleted{0}; DenseSet FuncsChanged; diff --git a/bolt/include/bolt/Passes/ShrinkWrapping.h b/bolt/include/bolt/Passes/ShrinkWrapping.h --- a/bolt/include/bolt/Passes/ShrinkWrapping.h +++ b/bolt/include/bolt/Passes/ShrinkWrapping.h @@ -310,6 +310,10 @@ /// Pass stats static std::atomic_uint64_t SpillsMovedRegularMode; static std::atomic_uint64_t SpillsMovedPushPopMode; + static std::atomic_uint64_t SpillsMovedDynamicCount; + static std::atomic_uint64_t SpillsFailedDynamicCount; + static std::atomic_uint64_t InstrDynamicCount; + static std::atomic_uint64_t StoreDynamicCount; Optional AnnotationIndex; @@ -515,7 +519,7 @@ BC.MIB->removeAnnotation(Inst, getAnnotationIndex()); } - bool perform(); + bool perform(bool HotOnly = false); static void printStats(); }; diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -2299,7 +2299,7 @@ uint64_t BBExecCount = BB->getExecutionCount(); if (BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE) continue; - TotalScore += BBExecCount; + TotalScore += BBExecCount * BB->getNumNonPseudos(); } FunctionScore = TotalScore; return FunctionScore; diff --git a/bolt/lib/Passes/AllocCombiner.cpp b/bolt/lib/Passes/AllocCombiner.cpp --- a/bolt/lib/Passes/AllocCombiner.cpp +++ b/bolt/lib/Passes/AllocCombiner.cpp @@ -101,6 +101,7 @@ BB.eraseInstruction(BB.findInstruction(Prev)); ++NumCombined; + DynamicCountCombined += BB.getKnownExecutionCount(); FuncsChanged.insert(&BF); Prev = &Inst; } @@ -116,7 +117,8 @@ }); outs() << "BOLT-INFO: Allocation combiner: " << NumCombined - << " empty spaces coalesced.\n"; + << " empty spaces coalesced (dyn count: " << DynamicCountCombined + << ").\n"; } } // end namespace bolt diff --git a/bolt/lib/Passes/FrameAnalysis.cpp b/bolt/lib/Passes/FrameAnalysis.cpp --- a/bolt/lib/Passes/FrameAnalysis.cpp +++ b/bolt/lib/Passes/FrameAnalysis.cpp @@ -527,16 +527,12 @@ } for (auto &I : BC.getBinaryFunctions()) { - uint64_t Count = I.second.getExecutionCount(); - if (Count != BinaryFunction::COUNT_NO_PROFILE) - CountDenominator += Count; + CountDenominator += I.second.getFunctionScore(); // "shouldOptimize" for passes that run after finalize if (!(I.second.isSimple() && I.second.hasCFG() && !I.second.isIgnored()) || !opts::shouldFrameOptimize(I.second)) { ++NumFunctionsNotOptimized; - if (Count != BinaryFunction::COUNT_NO_PROFILE) - CountFunctionsNotOptimized += Count; continue; } @@ -545,9 +541,7 @@ "FA breakdown", opts::TimeFA); if (!restoreFrameIndex(I.second)) { ++NumFunctionsFailedRestoreFI; - uint64_t Count = I.second.getExecutionCount(); - if (Count != BinaryFunction::COUNT_NO_PROFILE) - CountFunctionsFailedRestoreFI += Count; + CountFunctionsFailedRestoreFI += I.second.getFunctionScore(); continue; } } @@ -568,10 +562,7 @@ void FrameAnalysis::printStats() { outs() << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsNotOptimized - << " function(s) " - << format("(%.1lf%% dyn cov)", - (100.0 * CountFunctionsNotOptimized / CountDenominator)) - << " were not optimized.\n" + << " function(s) were not optimized.\n" << "BOLT-INFO: FRAME ANALYSIS: " << NumFunctionsFailedRestoreFI << " function(s) " << format("(%.1lf%% dyn cov)", diff --git a/bolt/lib/Passes/FrameOptimizer.cpp b/bolt/lib/Passes/FrameOptimizer.cpp --- a/bolt/lib/Passes/FrameOptimizer.cpp +++ b/bolt/lib/Passes/FrameOptimizer.cpp @@ -108,6 +108,7 @@ continue; ++NumRedundantLoads; + FreqRedundantLoads += BB.getKnownExecutionCount(); Changed = true; LLVM_DEBUG(dbgs() << "Redundant load instruction: "); LLVM_DEBUG(Inst.dump()); @@ -120,11 +121,12 @@ LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n"); break; } - ++NumLoadsChangedToReg; + FreqLoadsChangedToReg += BB.getKnownExecutionCount(); MIB->removeAnnotation(Inst, "FrameAccessEntry"); LLVM_DEBUG(dbgs() << "Changed operand to a reg\n"); if (MIB->isRedundantMove(Inst)) { ++NumLoadsDeleted; + FreqLoadsDeleted += BB.getKnownExecutionCount(); LLVM_DEBUG(dbgs() << "Created a redundant move\n"); // Delete it! ToErase.push_front(std::make_pair(&BB, &Inst)); @@ -136,7 +138,7 @@ if (!MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) { LLVM_DEBUG(dbgs() << "FAILED\n"); } else { - ++NumLoadsChangedToImm; + FreqLoadsChangedToImm += BB.getKnownExecutionCount(); MIB->removeAnnotation(Inst, "FrameAccessEntry"); LLVM_DEBUG(dbgs() << "Ok\n"); } @@ -199,6 +201,7 @@ continue; ++NumRedundantStores; + FreqRedundantStores += BB.getKnownExecutionCount(); Changed = true; LLVM_DEBUG(dbgs() << "Unused store instruction: "); LLVM_DEBUG(Inst.dump()); @@ -286,11 +289,16 @@ outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads << " redundant load(s) and " << NumRedundantStores << " unused store(s)\n"; - outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg - << " load(s) to use a register instead of a stack access, and " - << NumLoadsChangedToImm << " to use an immediate.\n" - << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and " - << NumRedundantStores << " store(s).\n"; + outs() << "BOLT-INFO: Frequency of redundant loads is " << FreqRedundantLoads + << " and frequency of unused stores is " << FreqRedundantStores + << "\n"; + outs() << "BOLT-INFO: Frequency of loads changed to use a register is " + << FreqLoadsChangedToReg + << " and frequency of loads changed to use an immediate is " + << FreqLoadsChangedToImm << "\n"; + outs() << "BOLT-INFO: FOP deleted " << NumLoadsDeleted + << " load(s) (dyn count: " << FreqLoadsDeleted << ") and " + << NumRedundantStores << " store(s)\n"; FA->printStats(); ShrinkWrapping::printStats(); } @@ -323,34 +331,51 @@ BC.MIB->getOrCreateAnnotationIndex("AccessesDeletedPos"); BC.MIB->getOrCreateAnnotationIndex("DeleteMe"); - ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { - if (!FA.hasFrameInfo(BF)) - return true; - - if (opts::FrameOptimization == FOP_HOT && - (BF.getKnownExecutionCount() < BC.getHotThreshold())) - return true; + std::vector> Top10Funcs; + auto LogFunc = [&](BinaryFunction &BF) { + auto Lower = std::lower_bound( + Top10Funcs.begin(), Top10Funcs.end(), BF.getKnownExecutionCount(), + [](const std::pair &Elmt, + uint64_t Value) { return Elmt.first > Value; }); + if (Lower == Top10Funcs.end() && Top10Funcs.size() >= 10) + return; + Top10Funcs.insert(Lower, + std::make_pair<>(BF.getKnownExecutionCount(), &BF)); + if (Top10Funcs.size() > 10) + Top10Funcs.resize(10); + }; + (void)LogFunc; - if (BF.getKnownExecutionCount() == 0) + ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) { + if (BF.getFunctionScore() == 0) return true; return false; }; + const bool HotOnly = opts::FrameOptimization == FOP_HOT; + ParallelUtilities::WorkFuncWithAllocTy WorkFunction = [&](BinaryFunction &BF, MCPlusBuilder::AllocatorIdTy AllocatorId) { DataflowInfoManager Info(BF, &RA, &FA, AllocatorId); ShrinkWrapping SW(FA, BF, Info, AllocatorId); - if (SW.perform()) { + if (SW.perform(HotOnly)) { std::lock_guard Lock(FuncsChangedMutex); FuncsChanged.insert(&BF); + LLVM_DEBUG(LogFunc(BF)); } }; ParallelUtilities::runOnEachFunctionWithUniqueAllocId( BC, ParallelUtilities::SchedulingPolicy::SP_INST_QUADRATIC, WorkFunction, SkipPredicate, "shrink-wrapping"); + + if (!Top10Funcs.empty()) { + outs() << "BOLT-INFO: top 10 functions changed by shrink wrapping:\n"; + for (const auto &Elmt : Top10Funcs) + outs() << Elmt.first << " : " << Elmt.second->getPrintName() << "\n"; + } } } // namespace bolt diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp --- a/bolt/lib/Passes/ShrinkWrapping.cpp +++ b/bolt/lib/Passes/ShrinkWrapping.cpp @@ -712,6 +712,10 @@ std::atomic_uint64_t ShrinkWrapping::SpillsMovedRegularMode{0}; std::atomic_uint64_t ShrinkWrapping::SpillsMovedPushPopMode{0}; +std::atomic_uint64_t ShrinkWrapping::SpillsMovedDynamicCount{0}; +std::atomic_uint64_t ShrinkWrapping::SpillsFailedDynamicCount{0}; +std::atomic_uint64_t ShrinkWrapping::InstrDynamicCount{0}; +std::atomic_uint64_t ShrinkWrapping::StoreDynamicCount{0}; using BBIterTy = BinaryBasicBlock::iterator; @@ -1273,16 +1277,19 @@ // Keeps info about successfully moved regs: reg index, save position and // save size std::vector> MovedRegs; + uint64_t TotalEstimatedWin = 0; for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) { MCInst *BestPosSave = nullptr; - uint64_t TotalEstimatedWin = 0; - if (!isBestSavePosCold(I, BestPosSave, TotalEstimatedWin)) + uint64_t EstimatedWin = 0; + if (!isBestSavePosCold(I, BestPosSave, EstimatedWin)) continue; SmallVector RestorePoints = - doRestorePlacement(BestPosSave, I, TotalEstimatedWin); - if (RestorePoints.empty()) + doRestorePlacement(BestPosSave, I, EstimatedWin); + if (RestorePoints.empty()) { + SpillsFailedDynamicCount += EstimatedWin; continue; + } const FrameIndexEntry *FIESave = CSA.SaveFIEByReg[I]; const FrameIndexEntry *FIELoad = CSA.LoadFIEByReg[I]; @@ -1295,8 +1302,10 @@ // If we don't know stack state at this point, bail if ((SPFP.first == SPT.SUPERPOSITION || SPFP.first == SPT.EMPTY) && - (SPFP.second == SPT.SUPERPOSITION || SPFP.second == SPT.EMPTY)) + (SPFP.second == SPT.SUPERPOSITION || SPFP.second == SPT.EMPTY)) { + SpillsFailedDynamicCount += EstimatedWin; continue; + } // Operation mode: if true, will insert push/pops instead of loads/restores bool UsePushPops = validatePushPopsMode(I, BestPosSave, SaveOffset); @@ -1319,6 +1328,7 @@ scheduleOldSaveRestoresRemoval(I, UsePushPops); scheduleSaveRestoreInsertions(I, BestPosSave, RestorePoints, UsePushPops); MovedRegs.emplace_back(std::make_tuple(I, BestPosSave, SaveSize)); + TotalEstimatedWin += EstimatedWin; } // Revert push-pop mode if it failed for a single CSR @@ -1348,6 +1358,7 @@ } } } + SpillsMovedDynamicCount += TotalEstimatedWin; // Update statistics if (!UsedPushPopMode) { @@ -1941,12 +1952,36 @@ } } -bool ShrinkWrapping::perform() { +bool ShrinkWrapping::perform(bool HotOnly) { HasDeletedOffsetCFIs = BitVector(BC.MRI->getNumRegs(), false); PushOffsetByReg = std::vector(BC.MRI->getNumRegs(), 0LL); PopOffsetByReg = std::vector(BC.MRI->getNumRegs(), 0LL); DomOrder = std::vector(BC.MRI->getNumRegs(), 0); + // Update pass statistics + uint64_t TotalInstrs = 0ULL; + uint64_t TotalStoreInstrs = 0ULL; + for (BinaryBasicBlock *BB : BF.layout()) { + uint64_t BBExecCount = BB->getExecutionCount(); + if (!BBExecCount || BBExecCount == BinaryBasicBlock::COUNT_NO_PROFILE) + continue; + for (const auto &Instr : *BB) { + if (BC.MIB->isPseudo(Instr)) + continue; + if (BC.MIB->isStore(Instr)) + TotalStoreInstrs += BBExecCount; + TotalInstrs += BBExecCount; + } + } + InstrDynamicCount += TotalInstrs; + StoreDynamicCount += TotalStoreInstrs; + + if (!FA.hasFrameInfo(BF)) + return false; + + if (HotOnly && (BF.getKnownExecutionCount() < BC.getHotThreshold())) + return false; + if (BF.checkForAmbiguousJumpTables()) { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName() << ".\n"); @@ -1993,6 +2028,24 @@ outs() << "BOLT-INFO: Shrink wrapping moved " << SpillsMovedRegularMode << " spills inserting load/stores and " << SpillsMovedPushPopMode << " spills inserting push/pops\n"; + if (!InstrDynamicCount || !StoreDynamicCount) + return; + outs() << "BOLT-INFO: Shrink wrapping reduced " << SpillsMovedDynamicCount + << " store executions (" + << format("%.1lf%%", + (100.0 * SpillsMovedDynamicCount / InstrDynamicCount)) + << " total instructions executed, " + << format("%.1lf%%", + (100.0 * SpillsMovedDynamicCount / StoreDynamicCount)) + << " store instructions)\n"; + outs() << "BOLT-INFO: Shrink wrapping failed at reducing " + << SpillsFailedDynamicCount << " store executions (" + << format("%.1lf%%", + (100.0 * SpillsFailedDynamicCount / InstrDynamicCount)) + << " total instructions executed, " + << format("%.1lf%%", + (100.0 * SpillsFailedDynamicCount / StoreDynamicCount)) + << " store instructions)\n"; } // Operators necessary as a result of using MCAnnotation diff --git a/bolt/test/X86/shrinkwrapping-pop-order.s b/bolt/test/X86/shrinkwrapping-pop-order.s --- a/bolt/test/X86/shrinkwrapping-pop-order.s +++ b/bolt/test/X86/shrinkwrapping-pop-order.s @@ -34,6 +34,8 @@ # Check shrink wrapping results: # CHECK: BOLT-INFO: Shrink wrapping moved 0 spills inserting load/stores and 2 spills inserting push/pops +# CHECK: BOLT-INFO: Shrink wrapping reduced 6 store executions (28.6% total instructions executed, 100.0% store instructions) +# CHECK: BOLT-INFO: Shrink wrapping failed at reducing 0 store executions (0.0% total instructions executed, 0.0% store instructions) # Check that order is correct # CHECK: Binary Function "_start" after frame-optimizer