diff --git a/bolt/include/bolt/Core/HashUtilities.h b/bolt/include/bolt/Core/HashUtilities.h --- a/bolt/include/bolt/Core/HashUtilities.h +++ b/bolt/include/bolt/Core/HashUtilities.h @@ -20,8 +20,6 @@ namespace llvm { namespace bolt { -uint16_t hash_64_to_16(const uint64_t Hash); - std::string hashInteger(uint64_t Value); std::string hashSymbol(BinaryContext &BC, const MCSymbol &Symbol); @@ -35,6 +33,8 @@ std::string hashBlock(BinaryContext &BC, const BinaryBasicBlock &BB, OperandHashFuncTy OperandHashFunc); +std::string hashBlockLoose(BinaryContext &BC, const BinaryBasicBlock &BB); + } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Core/HashUtilities.cpp b/bolt/lib/Core/HashUtilities.cpp --- a/bolt/lib/Core/HashUtilities.cpp +++ b/bolt/lib/Core/HashUtilities.cpp @@ -130,5 +130,42 @@ return HashString; } +/// A "loose" hash of a basic block to use with the stale profile matching. The +/// computed value will be the same for blocks with minor changes (such as +/// reordering of instructions or using different operands) but may result in +/// collisions that need to be resolved by a stronger hashing. +std::string hashBlockLoose(BinaryContext &BC, const BinaryBasicBlock &BB) { + // The hash is computed by creating a string of all lexicographically ordered + // instruction opcodes, which is then hashed with std::hash. + std::set Opcodes; + for (const MCInst &Inst : BB) { + if (BC.MIB->isPseudo(Inst)) + continue; + + // Ignore unconditional jumps, as they can be added / removed as a result + // of basic block reordering. + if (BC.MIB->isUnconditionalBranch(Inst)) + continue; + + // Do not distinguish different types of conditional jumps. + if (BC.MIB->isConditionalBranch(Inst)) { + Opcodes.insert("JMP"); + continue; + } + + std::string Mnemonic = BC.InstPrinter->getMnemonic(&Inst).first; + Mnemonic.erase( + std::remove_if(Mnemonic.begin(), Mnemonic.end(), + [](unsigned char ch) { return std::isspace(ch); }), + Mnemonic.end()); + Opcodes.insert(Mnemonic); + } + + std::string HashString; + for (const std::string &Opcode : Opcodes) + HashString.append(Opcode); + return HashString; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -73,64 +73,39 @@ cl::opt StaleMatchingCostBlockInc( "stale-matching-cost-block-inc", - cl::desc("The cost of increasing a block's count by one."), cl::init(110), + cl::desc("The cost of increasing a block count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostBlockDec( "stale-matching-cost-block-dec", - cl::desc("The cost of decreasing a block's count by one."), cl::init(100), + cl::desc("The cost of decreasing a block count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostBlockEntryInc( - "stale-matching-cost-block-entry-inc", - cl::desc("The cost of increasing the entry block's count by one."), - cl::init(110), cl::ReallyHidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockEntryDec( - "stale-matching-cost-block-entry-dec", - cl::desc("The cost of decreasing the entry block's count by one."), - cl::init(100), cl::ReallyHidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockZeroInc( - "stale-matching-cost-block-zero-inc", - cl::desc("The cost of increasing a count of zero-weight block by one."), - cl::init(10), cl::Hidden, cl::cat(BoltOptCategory)); - -cl::opt StaleMatchingCostBlockUnknownInc( - "stale-matching-cost-block-unknown-inc", - cl::desc("The cost of increasing an unknown block's count by one."), - cl::init(10), cl::ReallyHidden, cl::cat(BoltOptCategory)); - cl::opt StaleMatchingCostJumpInc( "stale-matching-cost-jump-inc", - cl::desc("The cost of increasing a jump's count by one."), cl::init(100), + cl::desc("The cost of increasing a jump count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostJumpFTInc( - "stale-matching-cost-jump-ft-inc", - cl::desc("The cost of increasing a fall-through jump's count by one."), - cl::init(100), cl::ReallyHidden, cl::cat(BoltOptCategory)); - cl::opt StaleMatchingCostJumpDec( "stale-matching-cost-jump-dec", - cl::desc("The cost of decreasing a jump's count by one."), cl::init(110), + cl::desc("The cost of decreasing a jump count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -cl::opt StaleMatchingCostJumpFTDec( - "stale-matching-cost-jump-ft-dec", - cl::desc("The cost of decreasing a fall-through jump's count by one."), - cl::init(110), cl::ReallyHidden, cl::cat(BoltOptCategory)); +cl::opt StaleMatchingCostBlockUnknownInc( + "stale-matching-cost-block-unknown-inc", + cl::desc("The cost of increasing an unknown block count by one."), + cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostJumpUnknownInc( "stale-matching-cost-jump-unknown-inc", - cl::desc("The cost of increasing an unknown jump's count by one."), - cl::init(50), cl::ReallyHidden, cl::cat(BoltOptCategory)); + cl::desc("The cost of increasing an unknown jump count by one."), + cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory)); cl::opt StaleMatchingCostJumpUnknownFTInc( "stale-matching-cost-jump-unknown-ft-inc", cl::desc( - "The cost of increasing an unknown fall-through jump's count by one."), - cl::init(5), cl::ReallyHidden, cl::cat(BoltOptCategory)); + "The cost of increasing an unknown fall-through jump count by one."), + cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory)); } // namespace opts @@ -145,7 +120,8 @@ using ValueOffset = Bitfield::Element; using ValueOpcode = Bitfield::Element; using ValueInstr = Bitfield::Element; - using ValueNeighbor = Bitfield::Element; + using ValuePred = Bitfield::Element; + using ValueSucc = Bitfield::Element; public: explicit BlendedBlockHash() {} @@ -154,7 +130,8 @@ Offset = Bitfield::get(Hash); OpcodeHash = Bitfield::get(Hash); InstrHash = Bitfield::get(Hash); - NeighborHash = Bitfield::get(Hash); + PredHash = Bitfield::get(Hash); + SuccHash = Bitfield::get(Hash); } /// Combine the blended hash into uint64_t. @@ -163,7 +140,8 @@ Bitfield::set(Hash, Offset); Bitfield::set(Hash, OpcodeHash); Bitfield::set(Hash, InstrHash); - Bitfield::set(Hash, NeighborHash); + Bitfield::set(Hash, PredHash); + Bitfield::set(Hash, SuccHash); return Hash; } @@ -175,7 +153,8 @@ "incorrect blended hash distance computation"); uint64_t Dist = 0; // Account for NeighborHash - Dist += NeighborHash == BBH.NeighborHash ? 0 : 1; + Dist += SuccHash == BBH.SuccHash ? 0 : 1; + Dist += PredHash == BBH.PredHash ? 0 : 1; Dist <<= 16; // Account for InstrHash Dist += InstrHash == BBH.InstrHash ? 0 : 1; @@ -192,9 +171,10 @@ /// (Strong) Hash of the basic block instructions, including opcodes and /// operands. uint16_t InstrHash{0}; - /// Hash of the (loose) basic block together with (loose) hashes of its - /// successors and predecessors. - uint16_t NeighborHash{0}; + /// (Loose) Hashes of the predecessors of the basic block. + uint8_t PredHash{0}; + /// (Loose) Hashes of the successors of the basic block. + uint8_t SuccHash{0}; }; /// The object is used to identify and match basic blocks in a BinaryFunction @@ -252,41 +232,43 @@ std::vector BlendedHashes(BasicBlocks.size()); std::vector OpcodeHashes(BasicBlocks.size()); - // Initialize hash components + // Initialize hash components. for (size_t I = 0; I < BasicBlocks.size(); I++) { const BinaryBasicBlock *BB = BasicBlocks[I]; assert(BB->getIndex() == I && "incorrect block index"); BlendedHashes[I].Offset = BB->getOffset(); - // Hashing complete instructions + // Hashing complete instructions. std::string InstrHashStr = hashBlock( BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); }); uint64_t InstrHash = std::hash{}(InstrHashStr); - BlendedHashes[I].InstrHash = hash_64_to_16(InstrHash); - // Hashing opcodes - std::string OpcodeHashStr = - hashBlock(BC, *BB, [](const MCOperand &Op) { return std::string(); }); + BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash); + // Hashing opcodes. + std::string OpcodeHashStr = hashBlockLoose(BC, *BB); OpcodeHashes[I] = std::hash{}(OpcodeHashStr); - BlendedHashes[I].OpcodeHash = hash_64_to_16(OpcodeHashes[I]); + BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]); } - // Initialize neighbor hash + // Initialize neighbor hash. for (size_t I = 0; I < BasicBlocks.size(); I++) { const BinaryBasicBlock *BB = BasicBlocks[I]; - uint64_t Hash = OpcodeHashes[I]; - // Append hashes of successors + // Append hashes of successors. + uint64_t Hash = 0; for (BinaryBasicBlock *SuccBB : BB->successors()) { uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()]; Hash = hashing::detail::hash_16_bytes(Hash, SuccHash); } - // Append hashes of predecessors + BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash); + + // Append hashes of predecessors. + Hash = 0; for (BinaryBasicBlock *PredBB : BB->predecessors()) { uint64_t PredHash = OpcodeHashes[PredBB->getIndex()]; Hash = hashing::detail::hash_16_bytes(Hash, PredHash); } - BlendedHashes[I].NeighborHash = hash_64_to_16(Hash); + BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash); } - // Assign hashes + // Assign hashes. for (size_t I = 0; I < BasicBlocks.size(); I++) { const BinaryBasicBlock *BB = BasicBlocks[I]; BB->setHash(BlendedHashes[I].combine()); @@ -409,20 +391,22 @@ const FlowBlock *MatchedBlock = Matcher.matchBlock(YamlHash); if (MatchedBlock != nullptr) { MatchedBlocks[YamlBB.Index] = MatchedBlock; - LLVM_DEBUG(dbgs() << "Matched yaml block with bid = " << YamlBB.Index - << " and hash = " << Twine::utohexstr(YamlBB.Hash) - << " to BB with index = " << MatchedBlock->Index - 1 + BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1]; + LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBB.Index << ")" + << " with hash " << Twine::utohexstr(YamlBB.Hash) + << " to BB (index = " << MatchedBlock->Index - 1 << ")" + << " with hash " << Twine::utohexstr(BinHash.combine()) << "\n"); // Update matching stats accounting for the matched block. - BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1]; if (Matcher.isHighConfidenceMatch(BinHash, YamlHash)) { ++BC.Stats.NumMatchedBlocks; BC.Stats.MatchedSampleCount += YamlBB.ExecCount; + LLVM_DEBUG(dbgs() << " exact match\n"); } } else { LLVM_DEBUG( - dbgs() << "Couldn't match yaml block with bid = " << YamlBB.Index - << " and hash = " << Twine::utohexstr(YamlBB.Hash) << "\n"); + dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index << ")" + << " with hash " << Twine::utohexstr(YamlBB.Hash) << "\n"); } // Update matching stats. @@ -575,16 +559,15 @@ Params.JoinIslands = opts::StaleMatchingJoinIslands; Params.CostBlockInc = opts::StaleMatchingCostBlockInc; + Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc; Params.CostBlockDec = opts::StaleMatchingCostBlockDec; - Params.CostBlockEntryInc = opts::StaleMatchingCostBlockEntryInc; - Params.CostBlockEntryDec = opts::StaleMatchingCostBlockEntryDec; - Params.CostBlockZeroInc = opts::StaleMatchingCostBlockZeroInc; + Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec; Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc; Params.CostJumpInc = opts::StaleMatchingCostJumpInc; - Params.CostJumpFTInc = opts::StaleMatchingCostJumpFTInc; + Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc; Params.CostJumpDec = opts::StaleMatchingCostJumpDec; - Params.CostJumpFTDec = opts::StaleMatchingCostJumpFTDec; + Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec; Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc; Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc; diff --git a/bolt/test/X86/Inputs/blarge_profile_stale.yaml b/bolt/test/X86/Inputs/blarge_profile_stale.yaml --- a/bolt/test/X86/Inputs/blarge_profile_stale.yaml +++ b/bolt/test/X86/Inputs/blarge_profile_stale.yaml @@ -6,6 +6,7 @@ profile-flags: [ lbr ] profile-origin: branch profile reader profile-events: '' + dfs-order: false functions: - name: SolveCubic fid: 6 @@ -15,20 +16,24 @@ blocks: - bid: 0 insns: 43 - hash: 0xD2411AC186118199 + hash: 0xed4db287e71c0000 exec: 151 - succ: [ { bid: 1, cnt: 4, mis: 2 }, { bid: 11, cnt: 0 } ] + succ: [ { bid: 1, cnt: 151, mis: 2 }, { bid: 7, cnt: 0 } ] - bid: 1 insns: 7 - hash: 0xDF0C9CC1FEAA70C3 - succ: [ { bid: 10, cnt: 0 }, { bid: 2, cnt: 0 } ] + hash: 0x39330000e4560088 + succ: [ { bid: 13, cnt: 151 }, { bid: 2, cnt: 0 } ] - bid: 13 insns: 26 - hash: 0xF05DC5524E99E56F - succ: [ { bid: 15, cnt: 89 }, { bid: 14, cnt: 0 } ] - - bid: 15 + hash: 0xa9700000fe202a7 + succ: [ { bid: 3, cnt: 89 }, { bid: 2, cnt: 10 } ] + - bid: 3 + insns: 9 + hash: 0x62391dad18a700a0 + succ: [ { bid: 5, cnt: 151 } ] + - bid: 5 insns: 9 - hash: 0xB2E8338276A9834E + hash: 0x4d906d19ecec0111 - name: usqrt fid: 7 hash: 0x8B62B1F9AD81EA35 @@ -37,15 +42,15 @@ blocks: - bid: 0 insns: 4 - hash: 0xb1e5b76571270000 + hash: 0x1111111111111111 exec: 20 succ: [ { bid: 1, cnt: 0 } ] - bid: 1 insns: 9 - hash: 0x587e93788b970010 + hash: 0x27e43a5e10cd0010 succ: [ { bid: 3, cnt: 320, mis: 171 }, { bid: 2, cnt: 0 } ] - bid: 3 insns: 2 - hash: 0x20e605d745e50039 + hash: 0x4db935b6471e0039 succ: [ { bid: 1, cnt: 300, mis: 33 }, { bid: 4, cnt: 20 } ] ... diff --git a/bolt/test/X86/reader-stale-yaml.test b/bolt/test/X86/reader-stale-yaml.test --- a/bolt/test/X86/reader-stale-yaml.test +++ b/bolt/test/X86/reader-stale-yaml.test @@ -1,39 +1,71 @@ # This script checks that YamlProfileReader in llvm-bolt is reading data -# correctly and stale data is corrected. +# correctly and stale data is corrected by profile inference. RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe +# Testing "usqrt" RUN: llvm-bolt %t.exe -o /dev/null --b %p/Inputs/blarge_profile_stale.yaml \ RUN: --print-cfg --print-only=usqrt --infer-stale-profile=1 \ -RUN: --profile-ignore-hash=1 --profile-use-dfs 2>&1 | FileCheck %s +RUN: --profile-ignore-hash=1 --profile-use-dfs=0 2>&1 | FileCheck %s -check-prefix=CHECK1 +# Testing "SolveCubic" +RUN: llvm-bolt %t.exe -o /dev/null --b %p/Inputs/blarge_profile_stale.yaml \ +RUN: --print-cfg --print-only=SolveCubic --infer-stale-profile=1 \ +RUN: --profile-ignore-hash=1 --profile-use-dfs=0 2>&1 | FileCheck %s -check-prefix=CHECK2 + +# Function "usqrt" has stale profile, since the number of blocks in the profile +# (nblocks=6) does not match the size of the CFG in the binary. The entry +# block (bid=0) has an incorrect (missing) count, which should be inferred by +# the algorithm. # Verify that yaml reader works as expected. -CHECK: pre-processing profile using YAML profile reader +CHECK1: pre-processing profile using YAML profile reader +CHECK1: Binary Function "usqrt" after building cfg { +CHECK1: State : CFG constructed +CHECK1: Address : 0x401170 +CHECK1: Size : 0x43 +CHECK1: Section : .text +CHECK1: IsSimple : 1 +CHECK1: BB Count : 5 +CHECK1: Exec Count : 20 +CHECK1: Branch Count: 640 +CHECK1: } +# Verify block counts. +CHECK1: .LBB01 (4 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 20) +CHECK1: .Ltmp[[#BB13:]] (9 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 320), .LFT[[#BB0:]] (mispreds: 0, count: 0) +CHECK1: .LFT[[#BB0:]] (2 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 0) +CHECK1: .Ltmp[[#BB12:]] (2 instructions, align : 1) +CHECK1: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 300), .LFT[[#BB1:]] (mispreds: 0, count: 20) +CHECK1: .LFT[[#BB1:]] (2 instructions, align : 1) +# Check the overall inference stats. +CHECK1: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile +CHECK1: inferred profile for 2 (100.00% of profiled, 100.00% of stale) functions responsible for {{.*}} samples ({{.*}} out of {{.*}}) -# Verify the inferred counts of "usqrt" that has stale profile: -# - the function has nblocks=6 in the profile, which makes it stale -# - block with bid=0 has an incorrect (missing) count, which is inferred -CHECK: Binary Function "usqrt" after building cfg { -CHECK: State : CFG constructed -CHECK: Address : 0x401170 -CHECK: Size : 0x43 -CHECK: Section : .text -CHECK: IsSimple : 1 -CHECK: BB Count : 5 -CHECK: Exec Count : 20 -CHECK: Branch Count: 640 -CHECK: } -# Verify block counts. -CHECK: .LBB01 (4 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 20) -CHECK: .Ltmp[[#BB13:]] (9 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 320), .LFT[[#BB0:]] (mispreds: 0, count: 0) -CHECK: .LFT[[#BB0:]] (2 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB12:]] (mispreds: 0, count: 0) -CHECK: .Ltmp[[#BB12:]] (2 instructions, align : 1) -CHECK: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 300), .LFT[[#BB1:]] (mispreds: 0, count: 20) -CHECK: .LFT[[#BB1:]] (2 instructions, align : 1) +# Function "SolveCubic" has stale profile, since there is one jump in the +# profile (from bid=13 to bid=2) which is not in the CFG in the binary. The test +# verifies that the inference is able to match two blocks (bid=1 and bid=13) +# using "loose" hashes and then correctly propagate the counts. -# Check the overal inference stats. -CHECK: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile -CHECK: inferred profile for 1 (50.00% of profiled, 100.00% of stale) functions responsible for 87.31% samples (640 out of 733) +CHECK2: pre-processing profile using YAML profile reader +CHECK2: Binary Function "SolveCubic" after building cfg { +CHECK2: State : CFG constructed +CHECK2: Address : 0x400e00 +CHECK2: Size : 0x368 +CHECK2: Section : .text +CHECK2: IsSimple : 1 +CHECK2: BB Count : 18 +CHECK2: Exec Count : 151 +CHECK2: Branch Count: 552 +# Verify block counts. +CHECK2: .LBB00 (43 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB7:]] (mispreds: 0, count: 0), .LFT[[#BB1:]] (mispreds: 0, count: 151) +CHECK2: .LFT[[#BB1:]] (5 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB13:]] (mispreds: 0, count: 151), .LFT[[#BB2:]] (mispreds: 0, count: 0) +CHECK2: .Ltmp[[#BB3:]] (26 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB5:]] (mispreds: 0, count: 151), .LFT[[#BB4:]] (mispreds: 0, count: 0) +CHECK2: .Ltmp[[#BB5:]] (9 instructions, align : 1) +CHECK2: .Ltmp[[#BB13:]] (12 instructions, align : 1) +CHECK2: Successors: .Ltmp[[#BB3:]] (mispreds: 0, count: 151) +CHECK2: 2 out of 7 functions in the binary (28.6%) have non-empty execution profile