diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -82,6 +82,13 @@ return OS; } +// AArch64-specific symbol markers used to delimit code/data in .text. +enum class MarkerSymType : char { + NONE = 0, + CODE, + DATA, +}; + enum class MemoryContentsType : char { UNKNOWN = 0, /// Unknown contents. POSSIBLE_JUMP_TABLE, /// Possibly a non-PIC jump table. @@ -662,6 +669,11 @@ TheTriple->getArch() == llvm::Triple::x86_64; } + // AArch64-specific functions to check if symbol is used to delimit + // code/data in .text. Code is marked by $x, data by $d. + MarkerSymType getMarkerType(const SymbolRef &Symbol) const; + bool isMarker(const SymbolRef &Symbol) const; + /// Iterate over all BinaryData. iterator_range getBinaryData() const { return make_range(BinaryDataMap.begin(), BinaryDataMap.end()); diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -1948,11 +1948,6 @@ return ColdLSDASymbol; } - /// True if the symbol is a mapping symbol used in AArch64 to delimit - /// data inside code section. - bool isDataMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const; - bool isCodeMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const; - void setOutputDataAddress(uint64_t Address) { OutputDataOffset = Address; } uint64_t getOutputDataAddress() const { return OutputDataOffset; } diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1639,6 +1639,35 @@ } } +MarkerSymType BinaryContext::getMarkerType(const SymbolRef &Symbol) const { + // For aarch64, the ABI defines mapping symbols so we identify data in the + // code section (see IHI0056B). $x identifies a symbol starting code or the + // end of a data chunk inside code, $d indentifies start of data. + if (!isAArch64() || ELFSymbolRef(Symbol).getSize()) + return MarkerSymType::NONE; + + Expected NameOrError = Symbol.getName(); + Expected TypeOrError = Symbol.getType(); + + if (!TypeOrError || !NameOrError) + return MarkerSymType::NONE; + + if (*TypeOrError != SymbolRef::ST_Unknown) + return MarkerSymType::NONE; + + if (*NameOrError == "$x" || NameOrError->startswith("$x.")) + return MarkerSymType::CODE; + + if (*NameOrError == "$d" || NameOrError->startswith("$d.")) + return MarkerSymType::DATA; + + return MarkerSymType::NONE; +} + +bool BinaryContext::isMarker(const SymbolRef &Symbol) const { + return getMarkerType(Symbol) != MarkerSymType::NONE; +} + void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction, uint64_t Offset, const BinaryFunction *Function, diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -3926,33 +3926,6 @@ } } -bool BinaryFunction::isDataMarker(const SymbolRef &Symbol, - uint64_t SymbolSize) const { - // For aarch64, the ABI defines mapping symbols so we identify data in the - // code section (see IHI0056B). $d identifies a symbol starting data contents. - if (BC.isAArch64() && Symbol.getType() && - cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 && - Symbol.getName() && - (cantFail(Symbol.getName()) == "$d" || - cantFail(Symbol.getName()).startswith("$d."))) - return true; - return false; -} - -bool BinaryFunction::isCodeMarker(const SymbolRef &Symbol, - uint64_t SymbolSize) const { - // For aarch64, the ABI defines mapping symbols so we identify data in the - // code section (see IHI0056B). $x identifies a symbol starting code or the - // end of a data chunk inside code. - if (BC.isAArch64() && Symbol.getType() && - cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 && - Symbol.getName() && - (cantFail(Symbol.getName()) == "$x" || - cantFail(Symbol.getName()).startswith("$x."))) - return true; - return false; -} - bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol, uint64_t SymbolSize) const { // If this symbol is in a different section from the one where the @@ -3963,7 +3936,7 @@ // Some symbols are tolerated inside function bodies, others are not. // The real function boundaries may not be known at this point. - if (isDataMarker(Symbol, SymbolSize) || isCodeMarker(Symbol, SymbolSize)) + if (BC.isMarker(Symbol)) return true; // It's okay to have a zero-sized symbol in the middle of non-zero-sized diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -880,47 +880,88 @@ std::vector SortedFileSymbols; std::copy_if(InputFile->symbol_begin(), InputFile->symbol_end(), std::back_inserter(SortedFileSymbols), isSymbolInMemory); + auto CompareSymbols = [this](const SymbolRef &A, const SymbolRef &B) { + // Marker symbols have the highest precedence, while + // SECTIONs have the lowest. + auto AddressA = cantFail(A.getAddress()); + auto AddressB = cantFail(B.getAddress()); + if (AddressA != AddressB) + return AddressA < AddressB; + + bool AMarker = BC->isMarker(A); + bool BMarker = BC->isMarker(B); + if (AMarker || BMarker) { + return AMarker && !BMarker; + } - std::stable_sort( - SortedFileSymbols.begin(), SortedFileSymbols.end(), - [](const SymbolRef &A, const SymbolRef &B) { - // FUNC symbols have the highest precedence, while SECTIONs - // have the lowest. - uint64_t AddressA = cantFail(A.getAddress()); - uint64_t AddressB = cantFail(B.getAddress()); - if (AddressA != AddressB) - return AddressA < AddressB; - - SymbolRef::Type AType = cantFail(A.getType()); - SymbolRef::Type BType = cantFail(B.getType()); - if (AType == SymbolRef::ST_Function && BType != SymbolRef::ST_Function) - return true; - if (BType == SymbolRef::ST_Debug && AType != SymbolRef::ST_Debug) - return true; + auto AType = cantFail(A.getType()); + auto BType = cantFail(B.getType()); + if (AType == SymbolRef::ST_Function && BType != SymbolRef::ST_Function) + return true; + if (BType == SymbolRef::ST_Debug && AType != SymbolRef::ST_Debug) + return true; - return false; - }); + return false; + }; + + std::stable_sort(SortedFileSymbols.begin(), SortedFileSymbols.end(), + CompareSymbols); + + auto LastSymbol = SortedFileSymbols.end() - 1; // For aarch64, the ABI defines mapping symbols so we identify data in the // code section (see IHI0056B). $d identifies data contents. - auto LastSymbol = SortedFileSymbols.end() - 1; + // Compilers usually merge multiple data objects in a single $d-$x interval, + // but we need every data object to be marked with $d. Because of that we + // create a vector of MarkerSyms with all locations of data objects. + + struct MarkerSym { + uint64_t Address; + MarkerSymType Type; + }; + + std::vector SortedMarkerSymbols; + auto addExtraDataMarkerPerSymbol = + [this](const std::vector &SortedFileSymbols, + std::vector &SortedMarkerSymbols) { + bool IsData = false; + uint64_t LastAddr = 0; + for (auto Sym = SortedFileSymbols.begin(); + Sym < SortedFileSymbols.end(); ++Sym) { + uint64_t Address = cantFail(Sym->getAddress()); + if (LastAddr == Address) // don't repeat markers + continue; + + MarkerSymType MarkerType = BC->getMarkerType(*Sym); + if (MarkerType != MarkerSymType::NONE) { + SortedMarkerSymbols.push_back(MarkerSym{Address, MarkerType}); + LastAddr = Address; + IsData = MarkerType == MarkerSymType::DATA; + continue; + } + + if (IsData) { + SortedMarkerSymbols.push_back( + MarkerSym{cantFail(Sym->getAddress()), MarkerSymType::DATA}); + LastAddr = Address; + } + } + }; + if (BC->isAArch64()) { + addExtraDataMarkerPerSymbol(SortedFileSymbols, SortedMarkerSymbols); LastSymbol = std::stable_partition( SortedFileSymbols.begin(), SortedFileSymbols.end(), - [](const SymbolRef &Symbol) { - StringRef Name = cantFail(Symbol.getName()); - return !(cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && - (Name == "$d" || Name.startswith("$d.") || Name == "$x" || - Name.startswith("$x."))); - }); + [this](const SymbolRef &Symbol) { return !BC->isMarker(Symbol); }); --LastSymbol; } BinaryFunction *PreviousFunction = nullptr; unsigned AnonymousId = 0; - const auto MarkersBegin = std::next(LastSymbol); - for (auto ISym = SortedFileSymbols.begin(); ISym != MarkersBegin; ++ISym) { + const auto SortedSymbolsEnd = std::next(LastSymbol); + for (auto ISym = SortedFileSymbols.begin(); ISym != SortedSymbolsEnd; + ++ISym) { const SymbolRef &Symbol = *ISym; // Keep undefined symbols for pretty printing? if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Undefined) @@ -1213,25 +1254,24 @@ adjustFunctionBoundaries(); // Annotate functions with code/data markers in AArch64 - for (auto ISym = MarkersBegin; ISym != SortedFileSymbols.end(); ++ISym) { - const SymbolRef &Symbol = *ISym; - uint64_t Address = - cantFail(Symbol.getAddress(), "cannot get symbol address"); - uint64_t SymbolSize = ELFSymbolRef(Symbol).getSize(); - BinaryFunction *BF = - BC->getBinaryFunctionContainingAddress(Address, true, true); + for (auto ISym = SortedMarkerSymbols.begin(); + ISym != SortedMarkerSymbols.end(); ++ISym) { + + auto *BF = + BC->getBinaryFunctionContainingAddress(ISym->Address, true, true); + if (!BF) { // Stray marker continue; } - const uint64_t EntryOffset = Address - BF->getAddress(); - if (BF->isCodeMarker(Symbol, SymbolSize)) { + const auto EntryOffset = ISym->Address - BF->getAddress(); + if (ISym->Type == MarkerSymType::CODE) { BF->markCodeAtOffset(EntryOffset); continue; } - if (BF->isDataMarker(Symbol, SymbolSize)) { + if (ISym->Type == MarkerSymType::DATA) { BF->markDataAtOffset(EntryOffset); - BC->AddressToConstantIslandMap[Address] = BF; + BC->AddressToConstantIslandMap[ISym->Address] = BF; continue; } llvm_unreachable("Unknown marker"); diff --git a/bolt/test/AArch64/Inputs/unmarked-data.yaml b/bolt/test/AArch64/Inputs/unmarked-data.yaml new file mode 100644 --- /dev/null +++ b/bolt/test/AArch64/Inputs/unmarked-data.yaml @@ -0,0 +1,90 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 + Entry: 0x210134 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x200040 + Align: 0x8 + FileSize: 0x0000e0 + MemSize: 0x0000e0 + Offset: 0x000040 + - Type: PT_LOAD + Flags: [ PF_R ] + VAddr: 0x200000 + Align: 0x10000 + FileSize: 0x000120 + MemSize: 0x000120 + Offset: 0x000000 + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + FirstSec: .text + LastSec: .text + VAddr: 0x210120 + Align: 0x10000 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x0 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x210120 + AddressAlign: 0x4 + Content: 030F0B0700000000030F0B0700000000C0035FD6FFFFFF97000080D2A80B8052010000D4 + - Name: .rela.text + Type: SHT_RELA + Flags: [ SHF_INFO_LINK ] + Link: .symtab + AddressAlign: 0x8 + Info: .text + Relocations: + - Offset: 0x210134 + Symbol: dummy + Type: R_AARCH64_CALL26 + - Name: .comment + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + Content: 4C696E6B65723A204C4C442031352E302E3000 +Symbols: + - Name: val + Index: SHN_ABS + Value: 0x70B0F03 + - Name: first + Section: .text + Value: 0x210120 + Size: 0x8 + - Name: '$d.0' + Section: .text + Value: 0x210120 + - Name: second + Section: .text + Value: 0x210128 + Size: 0x8 + - Name: '$x.1' + Section: .text + Value: 0x210130 + - Name: .text + Type: STT_SECTION + Section: .text + Value: 0x210120 + - Name: .comment + Type: STT_SECTION + Section: .comment + - Name: dummy + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x210130 + - Name: _start + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x210134 +... diff --git a/bolt/test/AArch64/unmarked-data.test b/bolt/test/AArch64/unmarked-data.test new file mode 100644 --- /dev/null +++ b/bolt/test/AArch64/unmarked-data.test @@ -0,0 +1,34 @@ +// This test checks that multiple data objects in text of which only first is marked get disassembled properly + +// RUN: yaml2obj %S/Inputs/unmarked-data.yaml -o %t.exe +// RUN: llvm-bolt %t.exe -o %t.bolt -lite=0 -use-old-text=0 2>&1 | FileCheck %s +// CHECK-NOT: BOLT-WARNING +// RUN: llvm-objdump -j .text -d --disassemble-symbols=first,second %t.bolt | FileCheck %s -check-prefix=CHECK-SYMBOL +// CHECK-SYMBOL: : +// CHECK-SYMBOL: : + +// YAML is based in the following assembly: + + .equ val, 0x070b0f03 // we use constant that is not a valid instruction so that it can't be silently dissassembled + .text + +first: + .xword val + .size first, .-first + +second: + .xword val + .size second, .-second + + .globl dummy + .type dummy, %function +dummy: // dummy function to force relocations + ret + + .globl _start + .type _start, %function +_start: + bl dummy + mov x0, #0 + mov w8, #93 + svc #0