diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -549,6 +549,9 @@ std::unique_ptr DisAsm; + /// Symbolic disassembler. + std::unique_ptr SymbolicDisAsm; + std::unique_ptr MAB; /// Indicates if relocations are available for usage. diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -836,6 +836,15 @@ return make_range(JumpTables.begin(), JumpTables.end()); } + /// Return relocation associated with a given \p Offset in the function, + /// or nullptr if no such relocation exists. + const Relocation *getRelocationAt(uint64_t Offset) { + assert(CurrentState == State::Empty && + "Relocations unavailable in the current function state."); + auto RI = Relocations.find(Offset); + return (RI == Relocations.end()) ? nullptr : &RI->second; + } + /// Returns the raw binary encoding of this function. ErrorOr> getData() const; diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -21,6 +21,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringMap.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCDisassembler/MCSymbolizer.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrAnalysis.h" @@ -44,6 +45,7 @@ class raw_ostream; namespace bolt { +class BinaryFunction; /// Different types of indirect branches encountered during disassembly. enum class IndirectBranchType : char { @@ -286,6 +288,12 @@ initAliases(); } + /// Create and return target-specific MC symbolizer for the \p Function. 
+ virtual std::unique_ptr + createTargetSymbolizer(BinaryFunction &Function) const { + return nullptr; + } + /// Initialize a new annotation allocator and return its id AllocatorIdTy initializeNewAnnotationAllocator() { AnnotationAllocators.emplace(MaxAllocatorId, AnnotationAllocator()); diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -251,6 +251,14 @@ BC->HasFixedLoadAddress = !IsPIC; + BC->SymbolicDisAsm = std::unique_ptr( + BC->TheTarget->createMCDisassembler(*BC->STI, *BC->Ctx)); + + if (!BC->SymbolicDisAsm) + return createStringError( + make_error_code(std::errc::not_supported), + Twine("BOLT-ERROR: no disassembler info for target ", TripleName)); + return std::move(BC); } diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1038,6 +1038,8 @@ auto &Ctx = BC.Ctx; auto &MIB = BC.MIB; + BC.SymbolicDisAsm->setSymbolizer(MIB->createTargetSymbolizer(*this)); + // Insert a label at the beginning of the function. This will be our first // basic block. Labels[0] = Ctx->createNamedTempSymbol("BB0"); @@ -1211,9 +1213,9 @@ continue; } - if (!BC.DisAsm->getInstruction(Instruction, Size, - FunctionData.slice(Offset), - AbsoluteInstrAddr, nulls())) { + if (!BC.SymbolicDisAsm->getInstruction(Instruction, Size, + FunctionData.slice(Offset), + AbsoluteInstrAddr, nulls())) { // Functions with "soft" boundaries, e.g. coming from assembly source, // can have 0-byte padding at the end. if (isZeroPaddingAt(Offset)) @@ -1253,12 +1255,16 @@ break; } - // Check if our disassembly is correct and matches the assembler output. - if (!BC.validateEncoding(Instruction, FunctionData.slice(Offset, Size))) { - if (opts::Verbosity >= 1) { + // Disassemble again without the symbolizer and check that the disassembly + // matches the assembler output. 
+ MCInst TempInst; + BC.DisAsm->getInstruction(TempInst, Size, FunctionData.slice(Offset), + AbsoluteInstrAddr, nulls()); + if (!BC.validateEncoding(TempInst, FunctionData.slice(Offset, Size))) { + if (opts::Verbosity >= 1) { errs() << "BOLT-WARNING: internal assembler/disassembler error " "detected for AVX512 instruction:\n"; - BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr); + BC.printInstruction(errs(), TempInst, AbsoluteInstrAddr); errs() << " in function " << *this << '\n'; } @@ -1351,7 +1357,7 @@ if (BC.isAArch64()) handleAArch64IndirectCall(Instruction, Offset); } - } else { + } else if (BC.isAArch64()) { // Check if there's a relocation associated with this instruction. bool UsedReloc = false; for (auto Itr = Relocations.lower_bound(Offset), @@ -1362,60 +1368,17 @@ if (Relocation.isPCRelative()) SymbolValue += getAddress() + Relocation.Offset; - // Process reference to the symbol. - if (BC.isX86()) - BC.handleAddressRef(SymbolValue, *this, Relocation.isPCRelative()); - - if (BC.isAArch64() || !Relocation.isPCRelative()) { - int64_t Value = Relocation.Value; - const bool Result = BC.MIB->replaceImmWithSymbolRef( - Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), - Value, Relocation.Type); - (void)Result; - assert(Result && "cannot replace immediate with relocation"); - - if (BC.isX86()) { - // Make sure we replaced the correct immediate (instruction - // can have multiple immediate operands). - assert( - truncateToSize(static_cast(Value), - Relocation::getSizeForType(Relocation.Type)) == - truncateToSize(Relocation.Value, Relocation::getSizeForType( - Relocation.Type)) && - "immediate value mismatch in function"); - } else if (BC.isAArch64()) { - // For aarch, if we replaced an immediate with a symbol from a - // relocation, we mark it so we do not try to further process a - // pc-relative operand. All we need is the symbol. - UsedReloc = true; - } - } else { - // Check if the relocation matches memop's Disp. 
- uint64_t TargetAddress; - if (!BC.MIB->evaluateMemOperandTarget(Instruction, TargetAddress, - AbsoluteInstrAddr, Size)) { - errs() << "BOLT-ERROR: PC-relative operand can't be evaluated\n"; - exit(1); - } - assert(TargetAddress == Relocation.Value + AbsoluteInstrAddr + Size && - "Immediate value mismatch detected."); - - const MCExpr *Expr = MCSymbolRefExpr::create( - Relocation.Symbol, MCSymbolRefExpr::VK_None, *BC.Ctx); - // Real addend for pc-relative targets is adjusted with a delta - // from relocation placement to the next instruction. - const uint64_t TargetAddend = - Relocation.Addend + Offset + Size - Relocation.Offset; - if (TargetAddend) { - const MCConstantExpr *Offset = - MCConstantExpr::create(TargetAddend, *BC.Ctx); - Expr = MCBinaryExpr::createAdd(Expr, Offset, *BC.Ctx); - } - BC.MIB->replaceMemOperandDisp( - Instruction, MCOperand::createExpr(BC.MIB->getTargetExprFor( - Instruction, Expr, *BC.Ctx, 0))); - UsedReloc = true; - } + int64_t Value = Relocation.Value; + const bool Result = BC.MIB->replaceImmWithSymbolRef( + Instruction, Relocation.Symbol, Relocation.Addend, Ctx.get(), Value, + Relocation.Type); + (void)Result; + assert(Result && "cannot replace immediate with relocation"); + + // For aarch64, if we replaced an immediate with a symbol from a + // relocation, we mark it so we do not try to further process a + // pc-relative operand. All we need is the symbol. + UsedReloc = true; } if (MIB->hasPCRelOperand(Instruction) && !UsedReloc) @@ -1442,6 +1405,9 @@ addInstruction(Offset, std::move(Instruction)); } + // Reset symbolizer for the disassembler. 
+ BC.SymbolicDisAsm->setSymbolizer(nullptr); + clearList(Relocations); if (!IsSimple) { diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -2,6 +2,7 @@ DebugInfoDWARF Demangle MC + MCDisassembler Object Support ) diff --git a/bolt/lib/Target/X86/CMakeLists.txt b/bolt/lib/Target/X86/CMakeLists.txt --- a/bolt/lib/Target/X86/CMakeLists.txt +++ b/bolt/lib/Target/X86/CMakeLists.txt @@ -2,12 +2,14 @@ BOLTCore BOLTUtils MC + MCDisassembler Support X86Desc ) add_llvm_library(LLVMBOLTTargetX86 X86MCPlusBuilder.cpp + X86MCSymbolizer.cpp DEPENDS X86CommonTableGen diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -13,6 +13,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86InstrRelaxTables.h" #include "MCTargetDesc/X86MCTargetDesc.h" +#include "X86MCSymbolizer.h" #include "bolt/Core/MCPlus.h" #include "bolt/Core/MCPlusBuilder.h" #include "llvm/BinaryFormat/ELF.h" @@ -81,6 +82,11 @@ const MCRegisterInfo *RegInfo) : MCPlusBuilder(Analysis, Info, RegInfo) {} + std::unique_ptr + createTargetSymbolizer(BinaryFunction &Function) const override { + return std::make_unique(Function); + } + bool isBranch(const MCInst &Inst) const override { return Analysis->isBranch(Inst) && !isTailCall(Inst); } diff --git a/bolt/lib/Target/X86/X86MCSymbolizer.h b/bolt/lib/Target/X86/X86MCSymbolizer.h new file mode 100644 --- /dev/null +++ b/bolt/lib/Target/X86/X86MCSymbolizer.h @@ -0,0 +1,43 @@ +//===- bolt/Target/X86/X86MCSymbolizer.h ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_CORE_X86MCSYMBOLIZER_H +#define BOLT_CORE_X86MCSYMBOLIZER_H + +#include "bolt/Core/BinaryFunction.h" +#include "llvm/MC/MCDisassembler/MCSymbolizer.h" + +namespace llvm { +namespace bolt { + +class X86MCSymbolizer : public MCSymbolizer { +protected: + BinaryFunction &Function; + +public: + X86MCSymbolizer(BinaryFunction &Function) + : MCSymbolizer(*Function.getBinaryContext().Ctx.get(), nullptr), + Function(Function) {} + + X86MCSymbolizer(const X86MCSymbolizer &) = delete; + X86MCSymbolizer &operator=(const X86MCSymbolizer &) = delete; + virtual ~X86MCSymbolizer(); + + bool tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &CStream, + int64_t Value, uint64_t Address, bool IsBranch, + uint64_t Offset, uint64_t OpSize, + uint64_t InstSize) override; + + void tryAddingPcLoadReferenceComment(raw_ostream &CStream, int64_t Value, + uint64_t Address) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/lib/Target/X86/X86MCSymbolizer.cpp b/bolt/lib/Target/X86/X86MCSymbolizer.cpp new file mode 100644 --- /dev/null +++ b/bolt/lib/Target/X86/X86MCSymbolizer.cpp @@ -0,0 +1,103 @@ +//===- bolt/Target/X86/X86MCSymbolizer.cpp --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "X86MCSymbolizer.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "bolt/Core/BinaryContext.h" +#include "bolt/Core/BinaryFunction.h" +#include "bolt/Core/MCPlusBuilder.h" +#include "bolt/Core/Relocation.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" + +#define DEBUG_TYPE "bolt-symbolizer" + +namespace llvm { +namespace bolt { + +X86MCSymbolizer::~X86MCSymbolizer() {} + +bool X86MCSymbolizer::tryAddingSymbolicOperand( + MCInst &Inst, raw_ostream &CStream, int64_t Value, uint64_t InstAddress, + bool IsBranch, uint64_t ImmOffset, uint64_t ImmSize, uint64_t InstSize) { + if (IsBranch) + return false; + + BinaryContext &BC = Function.getBinaryContext(); + MCContext *Ctx = BC.Ctx.get(); + + if (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) + return false; + + /// Add symbolic operand to the instruction with an optional addend. + auto addOperand = [&](const MCSymbol *Symbol, uint64_t Addend) { + const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, *Ctx); + if (Addend) + Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Addend, *Ctx), + *Ctx); + Inst.addOperand(MCOperand::createExpr(Expr)); + }; + + // Check for relocations against the operand. + const uint64_t InstOffset = InstAddress - Function.getAddress(); + if (const Relocation *Relocation = + Function.getRelocationAt(InstOffset + ImmOffset)) { + uint64_t SymbolValue = Relocation->Value - Relocation->Addend; + if (Relocation->isPCRelative()) + SymbolValue += InstAddress + ImmOffset; + + // Process reference to the symbol. + BC.handleAddressRef(SymbolValue, Function, Relocation->isPCRelative()); + + uint64_t Addend = Relocation->Addend; + // Real addend for pc-relative targets is adjusted with a delta from + // the relocation placement to the next instruction. 
+ if (Relocation->isPCRelative()) + Addend += InstOffset + InstSize - Relocation->Offset; + + addOperand(Relocation->Symbol, Addend); + + return true; + } + + // Check if the operand being added is a displacement part of a compound + // memory operand that uses PC-relative addressing. If it is, try to symbolize + // it without relocations. + const int MemOp = BC.MIB->getMemoryOperandNo(Inst); + if (MemOp == -1) + return false; + + const unsigned DispOp = MemOp + X86::AddrDisp; + if (Inst.getNumOperands() != DispOp) + return false; + + const MCOperand &Base = Inst.getOperand(MemOp + X86::AddrBaseReg); + if (Base.getReg() != BC.MRI->getProgramCounter()) + return false; + + const MCOperand &Scale = Inst.getOperand(MemOp + X86::AddrScaleAmt); + const MCOperand &Index = Inst.getOperand(MemOp + X86::AddrIndexReg); + if (Scale.getImm() != 0 && Index.getReg() != MCRegister::NoRegister) + return false; + + const MCSymbol *TargetSymbol; + uint64_t TargetOffset; + std::tie(TargetSymbol, TargetOffset) = + BC.handleAddressRef(Value, Function, /*IsPCRel*/ true); + + addOperand(TargetSymbol, TargetOffset); + + return true; +} + +void X86MCSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &CStream, + int64_t Value, + uint64_t Address) {} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/test/X86/double-rel.s b/bolt/test/X86/double-rel.s new file mode 100644 --- /dev/null +++ b/bolt/test/X86/double-rel.s @@ -0,0 +1,42 @@ +## Check that BOLT can correctly use relocations to symbolize instruction +## operands when an instruction can have up to two relocations associated +## with it. 
+ +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o +# RUN: ld.lld %t.o -o %t.exe -q --Tdata=0x80000 +# RUN: llvm-bolt %t.exe -relocs -o /dev/null -print-only=_start -print-disasm \ +# RUN: | FileCheck %s --check-prefix=CHECK-BOLT +# RUN: llvm-objdump -d --print-imm-hex %t.exe \ +# RUN: | FileCheck %s --check-prefix=CHECK-OBJDUMP + + .data + .globl VAR +VAR: + .quad + + .text + .globl _start + .type _start,@function +_start: + .cfi_startproc + +## VAR value is 0x80000. Using relocations, llvm-bolt should correctly +## symbolize the instruction operands. + + movq $VAR, 0x80000 +# CHECK-BOLT: movq $VAR, 0x80000 +# CHECK-OBJDUMP: movq $0x80000, 0x80000 + + movq $0x80000, VAR +# CHECK-BOLT-NEXT: movq $0x80000, VAR +# CHECK-OBJDUMP-NEXT: movq $0x80000, 0x80000 + + movq $VAR, VAR +# CHECK-BOLT-NEXT: movq $VAR, VAR +# CHECK-OBJDUMP-NEXT: movq $0x80000, 0x80000 + + retq + .size _start, .-_start + .cfi_endproc