Index: lld/ELF/Arch/X86_64.cpp
===================================================================
--- lld/ELF/Arch/X86_64.cpp
+++ lld/ELF/Arch/X86_64.cpp
@@ -46,9 +46,22 @@
   void relaxTlsLdToLe(uint8_t *loc, RelType type, uint64_t val) const override;
   bool adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
                                         uint8_t stOther) const override;
+  bool deleteFallThruJmpInsn(InputSection &IS, InputFile *File,
+                             InputSection *NextIS) const override;
 };
 } // namespace
 
+static std::vector<std::vector<uint8_t>> X86_NOP_INSTRUCTIONS = {
+    {0x90},
+    {0x66, 0x90},
+    {0x0f, 0x1f, 0x00},
+    {0x0f, 0x1f, 0x40, 0x00},
+    {0x0f, 0x1f, 0x44, 0x00, 0x00},
+    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}};
+
 X86_64::X86_64() {
   copyRel = R_X86_64_COPY;
   gotRel = R_X86_64_GLOB_DAT;
@@ -73,6 +86,208 @@
 int X86_64::getTlsGdRelaxSkip(RelType type) const { return 2; }
 
+// Opcodes for the different X86_64 jmp instructions.
+enum JmpInsnOpcode : uint32_t {
+  J_JMP_32,
+  J_JNE_32,
+  J_JE_32,
+  J_JG_32,
+  J_JGE_32,
+  J_JB_32,
+  J_JBE_32,
+  J_JL_32,
+  J_JLE_32,
+  J_JA_32,
+  J_JAE_32,
+  J_UNKNOWN,
+};
+
+// Given the first (optional) and second byte of the insn's opcode, this
+// returns the corresponding enum value.
+static JmpInsnOpcode getJmpInsnType(const uint8_t *First,
+                                    const uint8_t *Second) {
+  if (*Second == 0xe9)
+    return J_JMP_32;
+
+  if (First == nullptr)
+    return J_UNKNOWN;
+
+  if (*First == 0x0f) {
+    switch (*Second) {
+    case 0x84:
+      return J_JE_32;
+    case 0x85:
+      return J_JNE_32;
+    case 0x8f:
+      return J_JG_32;
+    case 0x8d:
+      return J_JGE_32;
+    case 0x82:
+      return J_JB_32;
+    case 0x86:
+      return J_JBE_32;
+    case 0x8c:
+      return J_JL_32;
+    case 0x8e:
+      return J_JLE_32;
+    case 0x87:
+      return J_JA_32;
+    case 0x83:
+      return J_JAE_32;
+    }
+  }
+  return J_UNKNOWN;
+}
+
+// Return the relocation index for input section IS with a specific Offset.
+// Returns the size of the relocation vector if no such relocation is found.
+static unsigned getRelocationWithOffset(const InputSection &IS,
+                                        uint64_t Offset) {
+  unsigned I = 0;
+  for (; I < IS.relocations.size(); ++I) {
+    if (IS.relocations[I].offset == Offset && IS.relocations[I].expr != R_NONE)
+      break;
+  }
+  return I;
+}
+
+static bool isRelocationForJmpInsn(Relocation &R) {
+  return (R.type == R_X86_64_PLT32 || R.type == R_X86_64_PC32 ||
+          R.type == R_X86_64_PC8);
+}
+
+static bool isDirectJmpInsnOpcode(const uint8_t *Opcode) {
+  return (*Opcode == 0xe9);
+}
+
+// Return true if Relocation R points to the first instruction in the
+// next section.
+static bool isFallThruRelocation(InputSection &IS, InputFile *File,
+                                 InputSection *NextIS, Relocation &R) {
+  if (!isRelocationForJmpInsn(R))
+    return false;
+
+  uint64_t AddrLoc = (IS.getOutputSection())->addr + IS.outSecOff + R.offset;
+  uint64_t TargetOffset =
+      SignExtend64(InputSectionBase::getRelocTargetVA(File, R.type, R.addend,
+                                                      AddrLoc, *R.sym, R.expr),
+                   (config->wordsize * 8));
+
+  // If this jmp is a fall thru, the target offset is the beginning of the
+  // next section.
+  uint64_t NextSectionOffset =
+      NextIS->getOutputSection()->addr + NextIS->outSecOff;
+  if ((AddrLoc + 4 + TargetOffset) != NextSectionOffset)
+    return false;
+
+  return true;
+}
+
+// Return the jmp instruction opcode that is the inverse of the given
+// opcode. For example, JE inverted is JNE.
+static JmpInsnOpcode invertJmpOpcode(const JmpInsnOpcode opcode) {
+  switch (opcode) {
+  case J_JE_32:
+    return J_JNE_32;
+  case J_JNE_32:
+    return J_JE_32;
+  case J_JG_32:
+    return J_JLE_32;
+  case J_JGE_32:
+    return J_JL_32;
+  case J_JB_32:
+    return J_JAE_32;
+  case J_JBE_32:
+    return J_JA_32;
+  case J_JL_32:
+    return J_JGE_32;
+  case J_JLE_32:
+    return J_JG_32;
+  case J_JA_32:
+    return J_JBE_32;
+  case J_JAE_32:
+    return J_JB_32;
+  default:
+    return J_UNKNOWN;
+  }
+  return J_UNKNOWN;
+}
+
+// Deletes direct jump instruction in input sections that jumps to the
+// following section as it is not required. If there are two consecutive jump
+// instructions, it checks if they can be flipped and one can be deleted.
+bool X86_64::deleteFallThruJmpInsn(InputSection &IS, InputFile *File,
+                                   InputSection *NextIS) const {
+  const unsigned SizeOfDirectJmpInsn = 5;
+
+  if (NextIS == nullptr)
+    return false;
+
+  if (IS.getSize() < SizeOfDirectJmpInsn)
+    return false;
+
+  // If this jmp insn can be removed, it is the last insn and the
+  // relocation is 4 bytes before the end.
+  unsigned RIndex = getRelocationWithOffset(IS, (IS.getSize() - 4));
+  if (RIndex == IS.relocations.size())
+    return false;
+
+  Relocation &R = IS.relocations[RIndex];
+
+  // Check if the relocation corresponds to a direct jmp.
+  const uint8_t *SecContents = IS.data().data();
+  if (!isDirectJmpInsnOpcode(SecContents + R.offset - 1))
+    return false;
+
+  if (isFallThruRelocation(IS, File, NextIS, R)) {
+    // This is a fall thru and can be deleted.
+    R.expr = R_NONE;
+    R.offset = 0;
+    IS.drop_back(SizeOfDirectJmpInsn);
+    IS.SpecialFiller = X86_NOP_INSTRUCTIONS;
+    return true;
+  }
+
+  // Now, check if flip and delete is possible.
+  const unsigned SizeOfJmpCCInsn = 6;
+  // To flip, there must be at least one JmpCC and one direct jmp.
+  if (IS.getSize() < (SizeOfDirectJmpInsn + SizeOfJmpCCInsn))
+    return false;
+
+  unsigned RbIndex =
+      getRelocationWithOffset(IS, (IS.getSize() - SizeOfDirectJmpInsn - 4));
+  if (RbIndex == IS.relocations.size())
+    return false;
+
+  Relocation &Rb = IS.relocations[RbIndex];
+
+  const uint8_t *JmpInsnB = SecContents + Rb.offset - 1;
+  JmpInsnOpcode JO_B = getJmpInsnType(JmpInsnB - 1, JmpInsnB);
+  if (JO_B == J_UNKNOWN)
+    return false;
+
+  if (!isFallThruRelocation(IS, File, NextIS, Rb))
+    return false;
+
+  // jmpCC jumps to the fall thru block, the branch can be flipped and the
+  // jmp can be deleted.
+  JmpInsnOpcode JInvert = invertJmpOpcode(JO_B);
+  if (JInvert == J_UNKNOWN)
+    return false;
+  IS.addJumpRelocation({JInvert, (Rb.offset - 1), 4});
+  // Move R's values to Rb
+  Rb.expr = R.expr;
+  Rb.type = R.type;
+  Rb.addend = R.addend;
+  Rb.sym = R.sym;
+  // Cancel R
+  R.expr = R_NONE;
+  R.offset = 0;
+  IS.drop_back(SizeOfDirectJmpInsn);
+  IS.SpecialFiller = X86_NOP_INSTRUCTIONS;
+  return true;
+}
+
 RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
                            const uint8_t *loc) const {
   if (type == R_X86_64_GOTTPOFF)
Index: lld/ELF/Config.h
===================================================================
--- lld/ELF/Config.h
+++ lld/ELF/Config.h
@@ -113,6 +113,7 @@
   llvm::StringRef sysroot;
   llvm::StringRef thinLTOCacheDir;
   llvm::StringRef thinLTOIndexOnlyArg;
+  llvm::StringRef ltoBBSections;
   std::pair<llvm::StringRef, llvm::StringRef> thinLTOObjectSuffixReplace;
   std::pair<llvm::StringRef, llvm::StringRef> thinLTOPrefixReplace;
   std::string rpath;
@@ -165,6 +166,7 @@
   bool ltoCSProfileGenerate;
   bool ltoDebugPassManager;
   bool ltoNewPassManager;
+  bool ltoUniqueBBSectionNames;
   bool mergeArmExidx;
   bool mipsN32Abi = false;
   bool mmapOutputFile;
@@ -173,6 +175,7 @@
   bool nostdlib;
   bool oFormatBinary;
   bool omagic;
+  bool optimizeBBJumps;
   bool optRemarksWithHotness;
   bool pacPlt;
   bool picThunk;
Index: lld/ELF/Driver.cpp
===================================================================
--- lld/ELF/Driver.cpp
+++ lld/ELF/Driver.cpp
@@ -27,6 +27,7 @@
#include "ICF.h" #include "InputFiles.h" #include "InputSection.h" +#include "LinkerPropeller.h" #include "LinkerScript.h" #include "MarkLive.h" #include "OutputSections.h" @@ -853,6 +854,9 @@ config->cref = args.hasFlag(OPT_cref, OPT_no_cref, false); config->defineCommon = args.hasFlag(OPT_define_common, OPT_no_define_common, !args.hasArg(OPT_relocatable)); + config->optimizeBBJumps = + args.hasFlag(OPT_optimize_bb_jumps, OPT_no_optimize_bb_jumps, false); + config->demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true); config->dependentLibraries = args.hasFlag(OPT_dependent_libraries, OPT_no_dependent_libraries, true); config->disableVerify = args.hasArg(OPT_disable_verify); @@ -896,6 +900,10 @@ config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq); config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1); config->ltoSampleProfile = args.getLastArgValue(OPT_lto_sample_profile); + config->ltoBBSections = args.getLastArgValue(OPT_lto_basicblock_sections); + config->ltoUniqueBBSectionNames = + args.hasFlag(OPT_lto_unique_bb_section_names, + OPT_no_lto_unique_bb_section_names, false); config->mapFile = args.getLastArgValue(OPT_Map); config->mipsGotSize = args::getInteger(args, OPT_mips_got_size, 0xfff0); config->mergeArmExidx = Index: lld/ELF/InputSection.cpp =================================================================== --- lld/ELF/InputSection.cpp +++ lld/ELF/InputSection.cpp @@ -646,8 +646,9 @@ } } -static uint64_t getRelocTargetVA(const InputFile *file, RelType type, int64_t a, - uint64_t p, const Symbol &sym, RelExpr expr) { +uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type, + int64_t a, uint64_t p, + const Symbol &sym, RelExpr expr) { switch (expr) { case R_ABS: case R_DTPREL: @@ -854,6 +855,12 @@ if (expr == R_NONE) continue; + if (expr == R_SIZE) { + target->relocateOne(bufLoc, type, + SignExtend64(sym.getSize() + addend)); + continue; + } + if (expr != R_ABS && expr != R_DTPREL && expr != 
R_RISCV_ADD) {
       std::string msg = getLocation(offset) + ": has non-ABS relocation " +
                         toString(type) +
@@ -924,6 +931,8 @@
   const unsigned bits = config->wordsize * 8;
 
   for (const Relocation &rel : relocations) {
+    if (rel.expr == R_NONE)
+      continue;
     uint64_t offset = rel.offset;
     if (auto *sec = dyn_cast<InputSection>(this))
       offset += sec->outSecOff;
Index: lld/ELF/LTO.cpp
===================================================================
--- lld/ELF/LTO.cpp
+++ lld/ELF/LTO.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include <cstddef>
 #include <cstring>
+#include <fstream>
 #include <memory>
 #include <string>
 #include <system_error>
@@ -58,6 +59,53 @@
   return ret;
 }
 
+// Basic Block Sections can be enabled for a subset of machine basic blocks.
+// This is done by passing a file containing names of functions for which basic
+// block sections are desired. Additionally, machine basic block ids of the
+// functions can also be specified for a finer granularity.
+// A file with basic block sections for all of function main and two blocks for
+// function foo looks like this:
+// ----------------------------
+// list.txt:
+// !main
+// !foo
+// !!2
+// !!4
+static bool getBBSectionsList(TargetOptions &Options) {
+  if (config->ltoBBSections.empty())
+    return false;
+
+  std::ifstream fin(config->ltoBBSections);
+  if (!fin.good()) {
+    errs() << "Cannot open " << config->ltoBBSections;
+    return false;
+  }
+  StringMap<std::set<unsigned>>::iterator fi = Options.BBSectionsList.end();
+  std::string line;
+  while ((std::getline(fin, line)).good()) {
+    StringRef S(line);
+    // Lines beginning with @, # are not useful here.
+    if (S.empty() || S[0] == '@' || S[0] == '#')
+      continue;
+    if (!S.consume_front("!") || S.empty())
+      break;
+    if (S.consume_front("!")) {
+      if (fi != Options.BBSectionsList.end())
+        fi->second.insert(std::stoi(S));
+      else {
+        errs() << "Found \"!!\" without preceding \"!\"";
+        return false;
+      }
+    } else {
+      // Start a new function.
+      auto R = Options.BBSectionsList.try_emplace(S.split('/').first);
+      fi = R.first;
+      assert(R.second);
+    }
+  }
+  return true;
+}
+
 static std::string getThinLTOOutputFile(StringRef modulePath) {
   return lto::getThinLTOOutputFile(modulePath,
                                    config->thinLTOPrefixReplace.first,
@@ -76,6 +124,22 @@
   c.Options.FunctionSections = true;
   c.Options.DataSections = true;
 
+  // Check if basic block sections must be used.
+  if (!config->ltoBBSections.empty()) {
+    if (config->ltoBBSections.equals("all"))
+      c.Options.BBSections = BasicBlockSection::All;
+    else if (config->ltoBBSections.equals("labels"))
+      c.Options.BBSections = BasicBlockSection::Labels;
+    else if (config->ltoBBSections.equals("none"))
+      c.Options.BBSections = BasicBlockSection::None;
+    else {
+      getBBSectionsList(c.Options);
+      c.Options.BBSections = BasicBlockSection::List;
+    }
+  }
+
+  c.Options.UniqueBBSectionNames = config->ltoUniqueBBSectionNames;
+
 if (auto relocModel = getRelocModelFromCMModel())
   c.RelocModel = *relocModel;
 else if (config->relocatable)
Index: lld/ELF/Options.td
===================================================================
--- lld/ELF/Options.td
+++ lld/ELF/Options.td
@@ -42,6 +42,10 @@
 defm defsym: Eq<"defsym", "Define a symbol alias">, MetaVarName<"=<symbol>=<symbol>">;
 
+defm optimize_bb_jumps: B<"optimize-bb-jumps",
+    "Remove direct jumps at the end to the next basic block",
+    "Do not remove any direct jumps at the end to the next basic block">;
+
 defm split_stack_adjust_size
     : Eq<"split-stack-adjust-size",
          "Specify adjustment to stack size when a split-stack function calls a "
@@ -491,6 +495,11 @@
   HelpText<"The format used for serializing remarks (default: YAML)">;
 defm plugin_opt: Eq<"plugin-opt", "specifies LTO options for compatibility with GNU linkers">;
 def save_temps: F<"save-temps">;
+def lto_basicblock_sections: J<"lto-basicblock-sections=">,
+  HelpText<"Enable basic block sections for LTO">;
+defm lto_unique_bb_section_names: B<"lto-unique-bb-section-names",
+  "Give unique names to every
basic block section for LTO",
+  "Do not give unique names to every basic block section for LTO">;
 def thinlto_cache_dir: J<"thinlto-cache-dir=">,
   HelpText<"Path to ThinLTO cached object file directory">;
 defm thinlto_cache_policy: Eq<"thinlto-cache-policy", "Pruning policy for the ThinLTO cache">;
Index: lld/ELF/OutputSections.cpp
===================================================================
--- lld/ELF/OutputSections.cpp
+++ lld/ELF/OutputSections.cpp
@@ -243,6 +243,22 @@
   sortByOrder(isd->sections, order);
 }
 
+static void fill(uint8_t *Buf, size_t Size,
+                 const std::vector<std::vector<uint8_t>> &SFiller) {
+  unsigned I = 0;
+  unsigned NC = Size / SFiller.back().size();
+  for (unsigned C = 0; C < NC; ++C) {
+    memcpy(Buf + I, SFiller.back().data(), SFiller.back().size());
+    I += SFiller.back().size();
+  }
+  unsigned remaining = Size - I;
+  if (!remaining)
+    return;
+  if (SFiller.at(remaining - 1).size() != remaining)
+    fatal("failed padding with special filler");
+  memcpy(Buf + I, SFiller.at(remaining - 1).data(), remaining);
+}
+
 // Fill [Buf, Buf + Size) with Filler.
 // This is used for linker script "=fillexp" command.
 static void fill(uint8_t *buf, size_t size,
                  const std::array<uint8_t, 4> &filler) {
@@ -331,7 +347,13 @@
         end = buf + size;
       else
         end = buf + sections[i + 1]->outSecOff;
-      fill(start, end - start, filler);
+      // Check if this IS needs a special filler.
+      if (isec->SpecialFiller)
+        fill(start, end - start, *(isec->SpecialFiller));
+      else if (isec->Filler)
+        fill(start, end - start, *(isec->Filler));
+      else
+        fill(start, end - start, filler);
     }
   });
Index: lld/ELF/Target.h
===================================================================
--- lld/ELF/Target.h
+++ lld/ELF/Target.h
@@ -86,6 +86,11 @@
 
   virtual ~TargetInfo();
 
+  virtual bool deleteFallThruJmpInsn(InputSection &IS, InputFile *File,
+                                     InputSection *NextIS) const {
+    return false;
+  }
+
   unsigned defaultCommonPageSize = 4096;
   unsigned defaultMaxPageSize = 4096;
Index: lld/ELF/Writer.cpp
===================================================================
--- lld/ELF/Writer.cpp
+++ lld/ELF/Writer.cpp
@@ -29,7 +29,8 @@
 #include "llvm/Support/SHA1.h"
 #include "llvm/Support/xxhash.h"
 #include <climits>
+#define DEBUG_TYPE "lld"
 
 using namespace llvm;
 using namespace llvm::ELF;
 using namespace llvm::object;
@@ -56,6 +57,7 @@
   void sortSections();
   void resolveShfLinkOrder();
   void finalizeAddressDependentContent();
+  void optimizeBasicBlockJumps();
   void sortInputSections();
   void finalizeSections();
   void checkExecuteOnly();
@@ -1603,6 +1605,83 @@
   }
 }
 
+// If input sections have been shrunk (basic block sections) then
+// update symbol values and sizes associated with these sections.
+static void fixSymbolsAfterShrinking() {
+  for (InputFile *File : objectFiles) {
+    parallelForEach(File->getSymbols(), [&](Symbol *Sym) {
+      auto *Def = dyn_cast<Defined>(Sym);
+      if (!Def)
+        return;
+
+      const auto *Sec = Def->section;
+      if (!Sec)
+        return;
+
+      const auto *InputSec = dyn_cast<InputSection>(Sec->repl);
+      if (!InputSec || !InputSec->BytesDropped)
+        return;
+
+      const auto NewSize = InputSec->data().size();
+
+      if (Def->value > NewSize) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Moving symbol " << Sym->getName() << " from "
+                   << Def->value << " to "
+                   << Def->value - InputSec->BytesDropped << " bytes\n");
+        Def->value -= InputSec->BytesDropped;
+        return;
+      }
+
+      if (Def->value + Def->size > NewSize) {
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Shrinking symbol " << Sym->getName() << " from "
+                   << Def->size << " to " << Def->size - InputSec->BytesDropped
+                   << " bytes\n");
+        Def->size -= InputSec->BytesDropped;
+      }
+    });
+  }
+}
+
+// If basic block sections exist, there are opportunities to delete fall thru
+// jumps and shrink jump instructions after basic block reordering. This
+// relaxation pass does that.
+template <class ELFT> void Writer<ELFT>::optimizeBasicBlockJumps() {
+  if (!config->optimizeBBJumps || !ELFT::Is64Bits)
+    return;
+
+  script->assignAddresses();
+  // For every output section that has executable input sections, this
+  // pass does the following:
+  // 1. It deletes all direct jump instructions in input sections that
+  //    jump to the following section as it is not required. If there
+  //    are two consecutive jump instructions, it checks if they can be
+  //    flipped and one can be deleted.
+  for (OutputSection *OS : outputSections) {
+    if (!(OS->flags & SHF_EXECINSTR))
+      continue;
+    std::vector<InputSection *> Sections = getInputSections(OS);
+    std::vector<unsigned> Result(Sections.size());
+    // Step 1: Delete all fall through jump instructions. Also, check if two
+    // consecutive jump instructions can be flipped so that a fall through jmp
+    // instruction can be deleted.
+    parallelForEachN(0, Sections.size(), [&](size_t I) {
+      InputSection *Next =
+          (I + 1) < Sections.size() ? Sections[I + 1] : nullptr;
+      InputSection &IS = *Sections[I];
+      Result[I] = target->deleteFallThruJmpInsn(IS, IS.getFile(), Next);
+    });
+    size_t NumDeleted = std::count(Result.begin(), Result.end(), true);
+    if (NumDeleted > 0) {
+      script->assignAddresses();
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Removing " << NumDeleted << " fall through jumps\n");
+    }
+  }
+  fixSymbolsAfterShrinking();
+}
+
 static void finalizeSynthetic(SyntheticSection *sec) {
   if (sec && sec->isNeeded() && sec->getParent())
     sec->finalizeContents();
@@ -1912,6 +1991,10 @@
   finalizeSynthetic(in.symTab);
   finalizeSynthetic(in.ppc64LongBranchTarget);
 
+  // Relaxation to delete inter-basic block jumps created by basic block
+  // sections.
+  optimizeBasicBlockJumps();
+
   // Fill other section headers. The dynamic table is finalized
   // at the end because some tags like RELSZ depend on result
   // of finalizing other sections.
Index: lld/test/ELF/bb-sections-optimize-jumps.s
===================================================================
--- /dev/null
+++ lld/test/ELF/bb-sections-optimize-jumps.s
@@ -0,0 +1,48 @@
+# REQUIRES: x86
+## basicblock-sections tests.
+## This test exercises optimizing the jumps (flipping and removing fall-through jumps)
+## at the end of basic blocks on a single function with a simple loop.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+# RUN: ld.lld -optimize-bb-jumps %t.o -o %t.out
+# RUN: llvm-objdump -d %t.out | FileCheck %s --check-prefix=CHECK
+
+# CHECK: foo:
+# CHECK-NEXT: nopl (%rax)
+# CHECK-NEXT: 74 03  je 3
+
+# CHECK: a.BB.foo:
+# CHECK-NEXT: nopl (%rax)
+
+# CHECK: aa.BB.foo:
+# CHECK-NEXT: nopl (%rax)
+# CHECK-NEXT: 75 f8  jne -8
+
+# CHECK: aaa.BB.foo:
+# CHECK-NEXT: nopl (%rax)
+# CHECK-NEXT: retq
+
+
+.section .text,"ax",@progbits
+# -- Begin function foo
+.type foo,@function
+foo:
+ nopl (%rax)
+ jne a.BB.foo
+ jmp aa.BB.foo
+
+.section .text,"ax",@progbits,unique,2
+a.BB.foo:
+ nopl (%rax)
+ jmp aa.BB.foo
+
+.section .text,"ax",@progbits,unique,3
+aa.BB.foo:
+ nopl (%rax)
+ jne a.BB.foo
+ jmp aaa.BB.foo
+
+.section .text,"ax",@progbits,unique,4
+aaa.BB.foo:
+ nopl (%rax)
+ ret