diff --git a/bolt/include/bolt/Passes/MCF.h b/bolt/include/bolt/Passes/MCF.h --- a/bolt/include/bolt/Passes/MCF.h +++ b/bolt/include/bolt/Passes/MCF.h @@ -13,6 +13,7 @@ namespace bolt { class BinaryFunction; +class DataflowInfoManager; enum MCFCostFunction : char { MCF_DISABLE = 0, @@ -22,6 +23,12 @@ MCF_BLAMEFTS }; +/// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations +/// without the Usability Burden" by Diego Novillo to make basic block counts +/// equal if we show that A dominates B, B post-dominates A and they are in the +/// same loop and same loop nesting level. +void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF); + /// Fill edge counts based on the basic block count. Used in nonLBR mode when /// we only have bb count. void estimateEdgeCounts(BinaryFunction &BF); diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h --- a/bolt/include/bolt/Utils/CommandLineOpts.h +++ b/bolt/include/bolt/Utils/CommandLineOpts.h @@ -35,6 +35,7 @@ extern llvm::cl::opt BucketsPerLine; extern llvm::cl::opt DiffOnly; extern llvm::cl::opt EnableBAT; +extern llvm::cl::opt EqualizeBBCounts; extern llvm::cl::opt RemoveSymtab; extern llvm::cl::opt ExecutionCountThreshold; extern llvm::cl::opt HeatmapBlock; diff --git a/bolt/lib/Passes/FrameOptimizer.cpp b/bolt/lib/Passes/FrameOptimizer.cpp --- a/bolt/lib/Passes/FrameOptimizer.cpp +++ b/bolt/lib/Passes/FrameOptimizer.cpp @@ -17,6 +17,7 @@ #include "bolt/Passes/ShrinkWrapping.h" #include "bolt/Passes/StackAvailableExpressions.h" #include "bolt/Passes/StackReachingUses.h" +#include "bolt/Utils/CommandLineOpts.h" #include "llvm/Support/Timer.h" #include #include diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp --- a/bolt/lib/Passes/MCF.cpp +++ b/bolt/lib/Passes/MCF.cpp @@ -13,6 +13,7 @@ #include "bolt/Passes/MCF.h" #include "bolt/Core/BinaryFunction.h" #include "bolt/Passes/DataflowInfoManager.h" +#include "bolt/Utils/CommandLineOpts.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Support/CommandLine.h" #include @@ -38,15 +39,6 @@ cl::Hidden, cl::cat(BoltOptCategory)); -static cl::opt -EqualizeBBCounts("equalize-bb-counts", - cl::desc("in non-LBR mode, use same count for BBs " - "that should have equivalent count"), - cl::ZeroOrMore, - cl::init(false), - cl::Hidden, - cl::cat(BoltOptCategory)); - static cl::opt UseRArcs("mcf-use-rarcs", cl::desc("in MCF, consider the possibility of cancelling flow to balance " @@ -378,12 +370,12 @@ return LoopNestLevel; } -/// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations -/// without the Usability Burden" by Diego Novillo to make basic block counts -/// equal if we show that A dominates B, B post-dominates A and they are in the -/// same loop and same loop nesting level. -void equalizeBBCounts(BinaryFunction &BF) { - auto Info = DataflowInfoManager(BF, nullptr, nullptr); +} // end anonymous namespace + +void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF) { + if (BF.begin() == BF.end()) + return; + DominatorAnalysis &DA = Info.getDominatorAnalysis(); DominatorAnalysis &PDA = Info.getPostDominatorAnalysis(); auto &InsnToBB = Info.getInsnToBBMap(); @@ -456,8 +448,6 @@ } } -} // end anonymous namespace - void estimateEdgeCounts(BinaryFunction &BF) { EdgeWeightMap PredEdgeWeights; EdgeWeightMap SuccEdgeWeights; @@ -467,7 +457,8 @@ } if (opts::EqualizeBBCounts) { LLVM_DEBUG(BF.print(dbgs(), "before equalize BB counts", true)); - equalizeBBCounts(BF); + auto Info = DataflowInfoManager(BF, nullptr, nullptr); + equalizeBBCounts(Info, BF); LLVM_DEBUG(BF.print(dbgs(), "after equalize BB counts", true)); } if (opts::IterativeGuess) diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp --- a/bolt/lib/Passes/ShrinkWrapping.cpp +++ b/bolt/lib/Passes/ShrinkWrapping.cpp @@ -13,6 +13,8 @@ #include "bolt/Passes/ShrinkWrapping.h" #include "bolt/Core/MCPlus.h" #include "bolt/Passes/DataflowInfoManager.h" +#include "bolt/Passes/MCF.h" +#include "bolt/Utils/CommandLineOpts.h" #include #include @@ -1989,6 +1991,9 @@ if (HotOnly && (BF.getKnownExecutionCount() < BC.getHotThreshold())) return false; + if (opts::EqualizeBBCounts) + equalizeBBCounts(Info, BF); + if (BF.checkForAmbiguousJumpTables()) { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName() << ".\n"); diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp --- a/bolt/lib/Utils/CommandLineOpts.cpp +++ b/bolt/lib/Utils/CommandLineOpts.cpp @@ -76,6 +76,12 @@ cl::ZeroOrMore, cl::cat(BoltCategory)); +cl::opt EqualizeBBCounts( + "equalize-bb-counts", + cl::desc("use same count for BBs that should have equivalent count (used " + "in non-LBR and shrink wrapping)"), + cl::ZeroOrMore, cl::init(false), cl::Hidden, cl::cat(BoltOptCategory)); + cl::opt RemoveSymtab("remove-symtab", cl::desc("Remove .symtab section"), cl::init(false), cl::ZeroOrMore, cl::cat(BoltCategory)); diff --git a/bolt/test/X86/shrinkwrapping-do-not-pessimize.s b/bolt/test/X86/shrinkwrapping-do-not-pessimize.s new file mode 100644 --- /dev/null +++ b/bolt/test/X86/shrinkwrapping-do-not-pessimize.s @@ -0,0 +1,58 @@ +# This checks that shrink wrapping does not pessimize a CFG pattern where two +# blocks can be proved to have the same execution count but, because of profile +# inaccuricies, we could move saves into the second block. We can prove two +# blocks have the same frequency when B post-dominate A and A dominates B and +# are at the same loop nesting level. This would be a pessimization because +# shrink wrapping is unlikely to be able to cleanly move PUSH instructions, +# inserting additional store instructions. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \ +# RUN: %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib +# RUN: llvm-bolt -relocs %t.exe -o %t.out -data %t.fdata \ +# RUN: -frame-opt=all -equalize-bb-counts | FileCheck %s + +# Here we create a CFG pattern with two blocks A and B belonging to the same +# equivalency class as defined by dominance relations and having in theory +# the same frequency. But we tweak edge counts from profile to make block A +# hotter than block B. + .globl _start + .type _start, %function +_start: + .cfi_startproc +# Hot prologue +# FDATA: 0 [unknown] 0 1 _start 0 0 10 + push %rbp + mov %rsp, %rbp + push %rbx + push %r14 + subq $0x20, %rsp +b: je end_if_1 +# FDATA: 1 _start #b# 1 _start #end_if_1# 0 1 +if_false: + movq rel(%rip), %rdi # Add this to create a relocation and run bolt w/ relocs +c: jmp end_if_1 +# Reduce frequency from 9 to 1 to simulate an inaccurate profile +# FDATA: 1 _start #c# 1 _start #end_if_1# 0 1 +end_if_1: + # first uses of R14 and RBX appear at this point, possible move point for SW + mov %r14, %rdi + mov %rbx, %rdi + leaq -0x20(%rbp), %r14 + movq -0x20(%rbp), %rdi + addq $0x20, %rsp + pop %r14 + pop %rbx + pop %rbp + ret + .cfi_endproc + .size _start, .-_start + + .data +rel: .quad end_if_1 + +# CHECK: BOLT-INFO: Shrink wrapping moved 0 spills inserting load/stores and 0 spills inserting push/pops