diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -366,14 +366,15 @@ std::string ColdCodeSectionName; /// Parent function fragment for split function fragments. - SmallPtrSet ParentFragments; + using FragmentsSetTy = SmallPtrSet; + FragmentsSetTy ParentFragments; /// Indicate if the function body was folded into another function. /// Used by ICF optimization. BinaryFunction *FoldedIntoFunction{nullptr}; /// All fragments for a parent function. - SmallPtrSet Fragments; + FragmentsSetTy Fragments; /// The profile data for the number of times the function was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; @@ -1779,6 +1780,15 @@ return llvm::is_contained(Fragments, &Other); } + /// Return the child fragment form parent function + iterator_range getFragments() const { + return iterator_range(Fragments.begin(), + Fragments.end()); + } + + /// Return the parent function for split function fragments. + FragmentsSetTy *getParentFragments() { return &ParentFragments; } + /// Returns if this function is a parent or child of \p Other function. bool isParentOrChildOf(const BinaryFunction &Other) const { return isChildOf(Other) || isParentOf(Other); diff --git a/bolt/lib/Passes/RegReAssign.cpp b/bolt/lib/Passes/RegReAssign.cpp --- a/bolt/lib/Passes/RegReAssign.cpp +++ b/bolt/lib/Passes/RegReAssign.cpp @@ -140,7 +140,7 @@ std::fill(RegScore.begin(), RegScore.end(), 0); std::fill(RankedRegs.begin(), RankedRegs.end(), 0); - for (BinaryBasicBlock &BB : Function) { + auto countRegScore = [&](BinaryBasicBlock &BB) { for (MCInst &Inst : BB) { const bool CannotUseREX = BC.MIB->cannotUseREX(Inst); const MCInstrDesc &Desc = BC.MII->get(Inst.getOpcode()); @@ -191,7 +191,15 @@ RegScore[RegEC] += BB.getKnownExecutionCount(); } } + }; + for (BinaryBasicBlock &BB : Function) + countRegScore(BB); + + for (BinaryFunction *ChildFrag : Function.getFragments()) { + for (BinaryBasicBlock &BB : *ChildFrag) + countRegScore(BB); } + std::iota(RankedRegs.begin(), RankedRegs.end(), 0); // 0, 1, 2, 3... llvm::sort(RankedRegs, [&](size_t A, size_t B) { return RegScore[A] > RegScore[B]; }); @@ -213,6 +221,17 @@ BinaryContext &BC = Function.getBinaryContext(); rankRegisters(Function); + // If there is a situation where function: + // A() -> A.cold() + // A.localalias() -> A.cold() + // simply swapping these two calls can cause issues. + for (BinaryFunction *ChildFrag : Function.getFragments()) { + if (ChildFrag->getParentFragments()->size() > 1) + return; + if (ChildFrag->empty()) + return; + } + // Bail early if our registers are all black listed, before running expensive // analysis passes bool Bail = true; @@ -304,6 +323,10 @@ << " with " << BC.MRI->getName(ExtReg) << "\n\n"); swap(Function, ClassicReg, ExtReg); FuncsChanged.insert(&Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + swap(*ChildFrag, ClassicReg, ExtReg); + FuncsChanged.insert(ChildFrag); + } ++Begin; if (Begin == End) break; @@ -315,6 +338,13 @@ BinaryContext &BC = Function.getBinaryContext(); rankRegisters(Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + if (ChildFrag->getParentFragments()->size() > 1) + return false; + if (ChildFrag->empty()) + return false; + } + // Try swapping R12, R13, R14 or R15 with RBX (we work with all callee-saved // regs except RBP) MCPhysReg Candidate = 0; @@ -345,6 +375,10 @@ (void)BC; swap(Function, RBX, Candidate); FuncsChanged.insert(&Function); + for (BinaryFunction *ChildFrag : Function.getFragments()) { + swap(*ChildFrag, RBX, Candidate); + FuncsChanged.insert(ChildFrag); + } return true; } @@ -404,7 +438,7 @@ for (auto &I : BC.getBinaryFunctions()) { BinaryFunction &Function = I.second; - if (!Function.isSimple() || Function.isIgnored()) + if (!Function.isSimple() || Function.isIgnored() || Function.isFragment()) continue; LLVM_DEBUG(dbgs() << "====================================\n"); diff --git a/bolt/test/runtime/X86/reg-reassign-swap-cold.s b/bolt/test/runtime/X86/reg-reassign-swap-cold.s new file mode 100644 --- /dev/null +++ b/bolt/test/runtime/X86/reg-reassign-swap-cold.s @@ -0,0 +1,64 @@ +# This test case reproduces a bug where, during register swapping, +# the code fragments associated with the function need to be swapped +# together (which may be generated during PGO optimization). If not +# handled properly, optimized binary execution can result in a segmentation fault. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang -no-pie %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.out -data=%t.fdata --reg-reassign | FileCheck %s +# RUN: %t.out + +# CHECK: BOLT-INFO: Reg Reassignment Pass Stats +# CHECK-NEXT: 2 functions affected. + .text + .globl main + .globl main.cold + .p2align 4, 0x90 + .type main,@function + .type main.cold,@function +main.cold: +bb1: + cmp $0x3, %r12 + jne bb8 +bb2: + jmp bb4 +main: # @main + .cfi_startproc +# %bb.0: # %entry + pushq %rax + pushq %r12 + pushq %rbx + .cfi_def_cfa_offset 16 + mov $0x1, %r12 + mov $0x2, %rbx + add $0x1, %r12 + shr $0x14, %r12 + mov $0x3, %r12 +bb3: + jmp bb1 +bb4: + cmp $0x3, %r12 +bb5: + jne bb8 +bb6: + xorl %eax, %eax +bb7: + popq %rcx + popq %rbx + popq %r12 + .cfi_def_cfa_offset 8 + retq +bb8: + mov $0x1, %rax + jmp bb7 +# FDATA: 1 main.cold #bb2# 1 main #bb4# 0 100 +# FDATA: 1 main #bb5# 1 main #bb6# 0 100 +# FDATA: 1 main #bb3# 1 main.cold 0 0 100 + +.Lfunc_end0: + .size main, .Lfunc_end0-main + .cfi_endproc