diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -82,6 +82,12 @@ return OS; } +enum class MarkerSymType : char { + NONE = 0, + CODE, + DATA, +}; + enum class MemoryContentsType : char { UNKNOWN = 0, /// Unknown contents. POSSIBLE_JUMP_TABLE, /// Possibly a non-PIC jump table. @@ -662,6 +668,13 @@ TheTriple->getArch() == llvm::Triple::x86_64; } + // AArch64-specific functions to check if symbol is used to delimit + // code/data in .text. Code is marked by $x, data by $d. + MarkerSymType getMarkerType(const SymbolRef &Symbol) const; + bool isMarker(const SymbolRef &Symbol) const; + bool isDataMarker(const SymbolRef &Symbol) const; + bool isCodeMarker(const SymbolRef &Symbol) const; + /// Iterate over all BinaryData. iterator_range getBinaryData() const { return make_range(BinaryDataMap.begin(), BinaryDataMap.end()); diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -658,6 +658,15 @@ Instructions.emplace(Offset, std::forward(Instruction)); } + /// Return offset for first instruction. If at the beginning of a function + /// there is data then offset of the first instruction could be different + /// from 0 + uint64_t getFirstInstructionOffset() const { + if (Instructions.empty()) + return 0; + return Instructions.begin()->first; + } + /// Convert CFI instructions to a standard form (remove remember/restore). void normalizeCFIState(); @@ -1951,11 +1960,6 @@ return ColdLSDASymbol; } - /// True if the symbol is a mapping symbol used in AArch64 to delimit - /// data inside code section. - bool isDataMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const; - bool isCodeMarker(const SymbolRef &Symbol, uint64_t SymbolSize) const; - void setOutputDataAddress(uint64_t Address) { OutputDataOffset = Address; } uint64_t getOutputDataAddress() const { return OutputDataOffset; } diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1637,6 +1637,43 @@ } } +MarkerSymType BinaryContext::getMarkerType(const SymbolRef &Symbol) const { + // For aarch64, the ABI defines mapping symbols so we identify data in the + // code section (see IHI0056B). $x identifies a symbol starting code or the + // end of a data chunk inside code, $d indentifies start of data. + if (!isAArch64() || ELFSymbolRef(Symbol).getSize()) + return MarkerSymType::NONE; + + llvm::Expected NameOrError = Symbol.getName(); + llvm::Expected TypeOrError = Symbol.getType(); + + if (!TypeOrError || !NameOrError) + return MarkerSymType::NONE; + + if (*TypeOrError != SymbolRef::ST_Unknown) + return MarkerSymType::NONE; + + if (*NameOrError == "$x" || NameOrError->startswith("$x.")) + return MarkerSymType::CODE; + + if (*NameOrError == "$d" || NameOrError->startswith("$d.")) + return MarkerSymType::DATA; + + return MarkerSymType::NONE; +} + +bool BinaryContext::isMarker(const SymbolRef &Symbol) const { + return getMarkerType(Symbol) != MarkerSymType::NONE; +} + +bool BinaryContext::isCodeMarker(const SymbolRef &Symbol) const { + return getMarkerType(Symbol) == MarkerSymType::CODE; +} + +bool BinaryContext::isDataMarker(const SymbolRef &Symbol) const { + return getMarkerType(Symbol) == MarkerSymType::DATA; +} + void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction, uint64_t Offset, const BinaryFunction *Function, diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1199,6 +1199,7 @@ }; uint64_t Size = 0; // instruction size + for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) { MCInst Instruction; const uint64_t AbsoluteInstrAddr = getAddress() + Offset; @@ -1440,6 +1441,9 @@ addInstruction(Offset, std::move(Instruction)); } + if (uint64_t Offset = getFirstInstructionOffset()) + Labels[Offset] = BC.Ctx->createNamedTempSymbol(); + clearList(Relocations); if (!IsSimple) { @@ -1952,7 +1956,7 @@ return false; assert(BasicBlocks.empty() && "basic block list should be empty"); - assert((Labels.find(0) != Labels.end()) && + assert((Labels.find(getFirstInstructionOffset()) != Labels.end()) && "first instruction should always have a label"); // Create basic blocks in the original layout order: @@ -3934,33 +3938,6 @@ } } -bool BinaryFunction::isDataMarker(const SymbolRef &Symbol, - uint64_t SymbolSize) const { - // For aarch64, the ABI defines mapping symbols so we identify data in the - // code section (see IHI0056B). $d identifies a symbol starting data contents. - if (BC.isAArch64() && Symbol.getType() && - cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 && - Symbol.getName() && - (cantFail(Symbol.getName()) == "$d" || - cantFail(Symbol.getName()).startswith("$d."))) - return true; - return false; -} - -bool BinaryFunction::isCodeMarker(const SymbolRef &Symbol, - uint64_t SymbolSize) const { - // For aarch64, the ABI defines mapping symbols so we identify data in the - // code section (see IHI0056B). $x identifies a symbol starting code or the - // end of a data chunk inside code. - if (BC.isAArch64() && Symbol.getType() && - cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && SymbolSize == 0 && - Symbol.getName() && - (cantFail(Symbol.getName()) == "$x" || - cantFail(Symbol.getName()).startswith("$x."))) - return true; - return false; -} - bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol, uint64_t SymbolSize) const { // If this symbol is in a different section from the one where the @@ -3971,7 +3948,7 @@ // Some symbols are tolerated inside function bodies, others are not. // The real function boundaries may not be known at this point. - if (isDataMarker(Symbol, SymbolSize) || isCodeMarker(Symbol, SymbolSize)) + if (BC.isMarker(Symbol)) return true; // It's okay to have a zero-sized symbol in the middle of non-zero-sized diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -880,47 +880,88 @@ std::vector SortedFileSymbols; std::copy_if(InputFile->symbol_begin(), InputFile->symbol_end(), std::back_inserter(SortedFileSymbols), isSymbolInMemory); + auto CompareSymbols = [this](const SymbolRef &A, const SymbolRef &B) { + // Marker symbols have the highest precedence, while + // SECTIONs have the lowest. + auto AddressA = cantFail(A.getAddress()); + auto AddressB = cantFail(B.getAddress()); + if (AddressA != AddressB) + return AddressA < AddressB; + + bool AMarker = BC->isMarker(A); + bool BMarker = BC->isMarker(B); + if (AMarker || BMarker) { + return AMarker && !BMarker; + } - std::stable_sort( - SortedFileSymbols.begin(), SortedFileSymbols.end(), - [](const SymbolRef &A, const SymbolRef &B) { - // FUNC symbols have the highest precedence, while SECTIONs - // have the lowest. - uint64_t AddressA = cantFail(A.getAddress()); - uint64_t AddressB = cantFail(B.getAddress()); - if (AddressA != AddressB) - return AddressA < AddressB; - - SymbolRef::Type AType = cantFail(A.getType()); - SymbolRef::Type BType = cantFail(B.getType()); - if (AType == SymbolRef::ST_Function && BType != SymbolRef::ST_Function) - return true; - if (BType == SymbolRef::ST_Debug && AType != SymbolRef::ST_Debug) - return true; + auto AType = cantFail(A.getType()); + auto BType = cantFail(B.getType()); + if (AType == SymbolRef::ST_Function && BType != SymbolRef::ST_Function) + return true; + if (BType == SymbolRef::ST_Debug && AType != SymbolRef::ST_Debug) + return true; - return false; - }); + return false; + }; + + std::stable_sort(SortedFileSymbols.begin(), SortedFileSymbols.end(), + CompareSymbols); + + auto LastSymbol = SortedFileSymbols.end() - 1; // For aarch64, the ABI defines mapping symbols so we identify data in the // code section (see IHI0056B). $d identifies data contents. - auto LastSymbol = SortedFileSymbols.end() - 1; + // Compilers usually merge multiple data objects in a single $d-$x interval, + // but we need every data object to be marked with $d. Because of that we + // create a vector of MarkerSyms with all locations of data objects. + + struct __attribute__((packed)) MarkerSym { + uint64_t Address; + MarkerSymType Type; + }; + + std::vector SortedMarkerSymbols; + auto addExtraDataMarkerPerSymbol = + [this](const std::vector &SortedFileSymbols, + std::vector &SortedMarkerSymbols) { + bool IsData = false; + uint64_t LastAddr = 0; + for (auto Sym = SortedFileSymbols.begin(); + Sym < SortedFileSymbols.end(); ++Sym) { + uint64_t Address = cantFail(Sym->getAddress()); + if (LastAddr == Address) // don't repeat markers + continue; + + MarkerSymType MarkerType = BC->getMarkerType(*Sym); + if (MarkerType != MarkerSymType::NONE) { + SortedMarkerSymbols.push_back(MarkerSym{Address, MarkerType}); + LastAddr = Address; + IsData = MarkerType == MarkerSymType::DATA; + continue; + } + + if (IsData) { + SortedMarkerSymbols.push_back( + MarkerSym{cantFail(Sym->getAddress()), MarkerSymType::DATA}); + LastAddr = Address; + } + } + }; + if (BC->isAArch64()) { + addExtraDataMarkerPerSymbol(SortedFileSymbols, SortedMarkerSymbols); LastSymbol = std::stable_partition( SortedFileSymbols.begin(), SortedFileSymbols.end(), - [](const SymbolRef &Symbol) { - StringRef Name = cantFail(Symbol.getName()); - return !(cantFail(Symbol.getType()) == SymbolRef::ST_Unknown && - (Name == "$d" || Name.startswith("$d.") || Name == "$x" || - Name.startswith("$x."))); - }); + [this](const SymbolRef &Symbol) { return !BC->isMarker(Symbol); }); --LastSymbol; } BinaryFunction *PreviousFunction = nullptr; unsigned AnonymousId = 0; - const auto MarkersBegin = std::next(LastSymbol); - for (auto ISym = SortedFileSymbols.begin(); ISym != MarkersBegin; ++ISym) { + const auto SortedSymbolsEnd = std::next(LastSymbol); + for (auto ISym = SortedFileSymbols.begin(); ISym != SortedSymbolsEnd; + ++ISym) { const SymbolRef &Symbol = *ISym; // Keep undefined symbols for pretty printing? if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Undefined) @@ -1213,25 +1254,24 @@ adjustFunctionBoundaries(); // Annotate functions with code/data markers in AArch64 - for (auto ISym = MarkersBegin; ISym != SortedFileSymbols.end(); ++ISym) { - const SymbolRef &Symbol = *ISym; - uint64_t Address = - cantFail(Symbol.getAddress(), "cannot get symbol address"); - uint64_t SymbolSize = ELFSymbolRef(Symbol).getSize(); - BinaryFunction *BF = - BC->getBinaryFunctionContainingAddress(Address, true, true); + for (auto ISym = SortedMarkerSymbols.begin(); + ISym != SortedMarkerSymbols.end(); ++ISym) { + + auto *BF = + BC->getBinaryFunctionContainingAddress(ISym->Address, true, true); + if (!BF) { // Stray marker continue; } - const uint64_t EntryOffset = Address - BF->getAddress(); - if (BF->isCodeMarker(Symbol, SymbolSize)) { + const auto EntryOffset = ISym->Address - BF->getAddress(); + if (ISym->Type == MarkerSymType::CODE) { BF->markCodeAtOffset(EntryOffset); continue; } - if (BF->isDataMarker(Symbol, SymbolSize)) { + if (ISym->Type == MarkerSymType::DATA) { BF->markDataAtOffset(EntryOffset); - BC->AddressToConstantIslandMap[Address] = BF; + BC->AddressToConstantIslandMap[ISym->Address] = BF; continue; } llvm_unreachable("Unknown marker"); diff --git a/bolt/test/AArch64/unmarked-data.s b/bolt/test/AArch64/unmarked-data.s new file mode 100644 --- /dev/null +++ b/bolt/test/AArch64/unmarked-data.s @@ -0,0 +1,31 @@ +// This test checks that multiple data objects in text of which only first is marked get disassembled properly + +// RUN: %clang %s -o %t.exe -Wl,-q +// COM: We make sure that symbol second is not marked as data by compiler. +// COM: We find it's address, then find all symbols at that address and check there is no $d marker +// RUN: out=`llvm-readelf -Ws %t.exe | grep second`; str=($(echo "$out" | tr '\t' '\n')); addr=${str[1]}; \ +// RUN: llvm-readelf -Ws %t.exe | grep $addr | FileCheck %s +// CHECK-NOT: $d +// RUN: llvm-bolt %t.exe -o %t.bolt -lite=0 -use-old-text=0 2>&1 | FileCheck %s +// CHECK-NOT: BOLT-WARNING +// RUN: llvm-objdump -j .text -d --disassemble-symbols=first,second %t.bolt | FileCheck %s -check-prefix=CHECK-SYMBOL +// CHECK-SYMBOL: : +// CHECK-SYMBOL: : + +.equ val, 0x070b0f03 // we use constant that is not a valid instruction so that it can't be silently dissassembled +.text + +first: + .xword val +.size first, .-first + +second: + .xword val +.size second, .-second + + .global main + .type main, %function +main: + mov x0, #0 + mov w8, #93 + svc #0