diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp --- a/lld/ELF/Arch/X86_64.cpp +++ b/lld/ELF/Arch/X86_64.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "InputFiles.h" +#include "OutputSections.h" #include "Symbols.h" #include "SyntheticSections.h" #include "Target.h" @@ -37,6 +38,8 @@ uint64_t pltEntryAddr) const override; void relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const override; + void applyJumpInstrMod(uint8_t *loc, JumpModType type, + unsigned size) const override; RelExpr adjustRelaxExpr(RelType type, const uint8_t *data, RelExpr expr) const override; @@ -52,9 +55,25 @@ uint64_t val) const override; bool adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end, uint8_t stOther) const override; + bool deleteFallThruJmpInsn(InputSection &is, InputFile *file, + InputSection *nextIS) const override; }; } // namespace
+// This is a vector of NOP instructions of sizes from 1 to 9 bytes; the entry
+// at index k is k + 1 bytes long. The appropriately sized instructions are used
+// to fill the gaps between sections which are executed during fall through.
+static const std::vector<std::vector<uint8_t>> nopInstructions = {
+    {0x90},
+    {0x66, 0x90},
+    {0x0f, 0x1f, 0x00},
+    {0x0f, 0x1f, 0x40, 0x00},
+    {0x0f, 0x1f, 0x44, 0x00, 0x00},
+    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}};
+
 X86_64::X86_64() { copyRel = R_X86_64_COPY; gotRel = R_X86_64_GLOB_DAT; @@ -71,6 +90,7 @@ pltEntrySize = 16; ipltEntrySize = 16; trapInstr = {0xcc, 0xcc, 0xcc, 0xcc}; // 0xcc = INT3 + nopInstrs = nopInstructions; // Align to the large page size (known as a superpage or huge page). // FreeBSD automatically promotes large, superpage-aligned allocations. @@ -79,6 +99,216 @@ int X86_64::getTlsGdRelaxSkip(RelType type) const { return 2; }
+// Opcodes for the different X86_64 jmp instructions.
+enum JmpInsnOpcode : uint32_t {
+  J_JMP_32,
+  J_JNE_32,
+  J_JE_32,
+  J_JG_32,
+  J_JGE_32,
+  J_JB_32,
+  J_JBE_32,
+  J_JL_32,
+  J_JLE_32,
+  J_JA_32,
+  J_JAE_32,
+  J_UNKNOWN,
+};
+
+// Classify a jump from its opcode bytes. second is the opcode byte itself;
+// first is the byte before it, or null, in which case only jmp is recognized.
+static JmpInsnOpcode getJmpInsnType(const uint8_t *first,
+                                    const uint8_t *second) {
+  if (*second == 0xe9)
+    return J_JMP_32;
+
+  if (first == nullptr)
+    return J_UNKNOWN;
+
+  if (*first == 0x0f) {
+    switch (*second) {
+    case 0x84:
+      return J_JE_32;
+    case 0x85:
+      return J_JNE_32;
+    case 0x8f:
+      return J_JG_32;
+    case 0x8d:
+      return J_JGE_32;
+    case 0x82:
+      return J_JB_32;
+    case 0x86:
+      return J_JBE_32;
+    case 0x8c:
+      return J_JL_32;
+    case 0x8e:
+      return J_JLE_32;
+    case 0x87:
+      return J_JA_32;
+    case 0x83:
+      return J_JAE_32;
+    }
+  }
+  return J_UNKNOWN;
+}
+
+// Return the index of the last live (non-R_NONE) relocation of IS located at
+// Offset, or is.relocations.size() if no such relocation is found.
+static unsigned getRelocationWithOffset(const InputSection &is,
+                                        uint64_t offset) {
+  unsigned size = is.relocations.size();
+  for (unsigned i = size; i-- > 0;) {
+    if (is.relocations[i].offset == offset && is.relocations[i].expr != R_NONE)
+      return i;
+  }
+  return size;
+}
+
+// Returns true if r corresponds to a relocation used for a jump instruction.
+// TODO: Once special relocations for relaxable jump instructions are available,
+// this should be modified to use those relocations.
+static bool isRelocationForJmpInsn(const Relocation &r) {
+  return r.type == R_X86_64_PLT32 || r.type == R_X86_64_PC32 ||
+         r.type == R_X86_64_PC8;
+}
+
+// Return true if relocation r points to the first instruction in the
+// next section.
+// TODO: Delete this once psABI reserves a new relocation type for fall thru
+// jumps.
+static bool isFallThruRelocation(InputSection &is, InputFile *file,
+                                 InputSection *nextIS, Relocation &r) {
+  if (!isRelocationForJmpInsn(r))
+    return false;
+
+  uint64_t addrLoc = is.getOutputSection()->addr + is.outSecOff + r.offset;
+  uint64_t targetOffset = InputSectionBase::getRelocTargetVA(
+      file, r.type, r.addend, addrLoc, *r.sym, r.expr);
+
+  // addrLoc + 4 is the address just past the 4-byte field at r.offset. The jmp
+  // is a fall thru if that plus targetOffset is where the next section starts.
+  uint64_t nextSectionOffset =
+      nextIS->getOutputSection()->addr + nextIS->outSecOff;
+  return (addrLoc + 4 + targetOffset) == nextSectionOffset;
+}
+
+// Return the jmp instruction opcode that is the inverse of the given
+// opcode (e.g. JE <-> JNE), or J_UNKNOWN if the opcode has no inverse here.
+static JmpInsnOpcode invertJmpOpcode(const JmpInsnOpcode opcode) {
+  switch (opcode) {
+  case J_JE_32:
+    return J_JNE_32;
+  case J_JNE_32:
+    return J_JE_32;
+  case J_JG_32:
+    return J_JLE_32;
+  case J_JGE_32:
+    return J_JL_32;
+  case J_JB_32:
+    return J_JAE_32;
+  case J_JBE_32:
+    return J_JA_32;
+  case J_JL_32:
+    return J_JGE_32;
+  case J_JLE_32:
+    return J_JG_32;
+  case J_JA_32:
+    return J_JBE_32;
+  case J_JAE_32:
+    return J_JB_32;
+  default:
+    return J_UNKNOWN;
+  }
+}
+
+// Deletes the direct jump instruction at the end of an input section if it
+// jumps to the following section, as it is not required. If there are two
+// consecutive jump instructions, it checks if they can be flipped and one can
+// be deleted. For example:
+//   .section .text
+//   a.BB.foo:
+//      ...
+//      10: jne aa.BB.foo
+//      16: jmp bar
+//   aa.BB.foo:
+//      ...
+//
+// can be converted to:
+//   a.BB.foo:
+//      ...
+//      10: je bar  # jne flipped to je and the jmp is deleted.
+//   aa.BB.foo:
+//      ...
+bool X86_64::deleteFallThruJmpInsn(InputSection &is, InputFile *file,
+                                   InputSection *nextIS) const {
+  const unsigned sizeOfDirectJmpInsn = 5;
+
+  if (nextIS == nullptr)
+    return false;
+
+  if (is.getSize() < sizeOfDirectJmpInsn)
+    return false;
+
+  // If this jmp insn can be removed, it is the last insn and the
+  // relocation is 4 bytes before the end.
+  unsigned rIndex = getRelocationWithOffset(is, is.getSize() - 4);
+  if (rIndex == is.relocations.size())
+    return false;
+
+  Relocation &r = is.relocations[rIndex];
+
+  // Check if the relocation corresponds to a direct jmp.
+  const uint8_t *secContents = is.data().data();
+  // If it is not a direct jmp instruction, there is nothing to do here.
+  if (*(secContents + r.offset - 1) != 0xe9)
+    return false;
+
+  if (isFallThruRelocation(is, file, nextIS, r)) {
+    // This is a fall thru and can be deleted.
+    r.expr = R_NONE;
+    r.offset = 0;
+    is.drop_back(sizeOfDirectJmpInsn);
+    is.nopFiller = true;
+    return true;
+  }
+
+  // Now, check if flip and delete is possible.
+  const unsigned sizeOfJmpCCInsn = 6;
+  // To flip, there must be at least one JmpCC and one direct jmp.
+  if (is.getSize() < sizeOfDirectJmpInsn + sizeOfJmpCCInsn)
+    return false;
+
+  unsigned rbIndex =
+      getRelocationWithOffset(is, (is.getSize() - sizeOfDirectJmpInsn - 4));
+  if (rbIndex == is.relocations.size())
+    return false;
+
+  Relocation &rB = is.relocations[rbIndex];
+
+  const uint8_t *jmpInsnB = secContents + rB.offset - 1;
+  JmpInsnOpcode jmpOpcodeB = getJmpInsnType(jmpInsnB - 1, jmpInsnB);
+  if (jmpOpcodeB == J_UNKNOWN)
+    return false;
+
+  if (!isFallThruRelocation(is, file, nextIS, rB))
+    return false;
+
+  // jmpCC jumps to the fall thru block, the branch can be flipped and the
+  // jmp can be deleted.
+  JmpInsnOpcode jInvert = invertJmpOpcode(jmpOpcodeB);
+  if (jInvert == J_UNKNOWN)
+    return false;
+  is.jumpInstrMods.push_back({jInvert, (rB.offset - 1), 4});
+  // Move r's values to rB except the offset.
+  rB = {r.expr, r.type, rB.offset, r.addend, r.sym};
+  // Cancel r.
+  r.expr = R_NONE;
+  r.offset = 0;
+  is.drop_back(sizeOfDirectJmpInsn);
+  is.nopFiller = true;
+  return true;
+}
+
 RelExpr X86_64::getRelExpr(RelType type, const Symbol &s, const uint8_t *loc) const { if (type == R_X86_64_GOTTPOFF) @@ -357,6 +587,94 @@ "expected R_X86_64_PLT32 or R_X86_64_GOTPCRELX after R_X86_64_TLSLD"); }
+// A JumpInstrMod at a specific offset indicates that the jump instruction
+// opcode at that offset must be modified. This is used to relax jump
+// instructions with basic block sections. loc is the opcode byte to rewrite;
+// when size is 4, conditional jumps also get their 0x0f prefix at loc[-1].
+void X86_64::applyJumpInstrMod(uint8_t *loc, JumpModType type,
+                               unsigned size) const {
+  switch (type) {
+  case J_JMP_32:
+    if (size == 4)
+      *loc = 0xe9;
+    else
+      *loc = 0xeb;
+    break;
+  case J_JE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x84;
+    } else
+      *loc = 0x74;
+    break;
+  case J_JNE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x85;
+    } else
+      *loc = 0x75;
+    break;
+  case J_JG_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x8f;
+    } else
+      *loc = 0x7f;
+    break;
+  case J_JGE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x8d;
+    } else
+      *loc = 0x7d;
+    break;
+  case J_JB_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x82;
+    } else
+      *loc = 0x72;
+    break;
+  case J_JBE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x86;
+    } else
+      *loc = 0x76;
+    break;
+  case J_JL_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x8c;
+    } else
+      *loc = 0x7c;
+    break;
+  case J_JLE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x8e;
+    } else
+      *loc = 0x7e;
+    break;
+  case J_JA_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x87;
+    } else
+      *loc = 0x77;
+    break;
+  case J_JAE_32:
+    if (size == 4) {
+      loc[-1] = 0x0f;
+      *loc = 0x83;
+    } else
+      *loc = 0x73;
+    break;
+  default:
+    llvm_unreachable("Unknown Jump Relocation");
+  }
+}
+
 void X86_64::relocate(uint8_t *loc, const Relocation &rel,
uint64_t val) const { switch (rel.type) { case R_X86_64_8: diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -114,6 +114,7 @@ llvm::StringRef sysroot; llvm::StringRef thinLTOCacheDir; llvm::StringRef thinLTOIndexOnlyArg; + llvm::StringRef ltoBasicBlockSections; std::pair thinLTOObjectSuffixReplace; std::pair thinLTOPrefixReplace; std::string rpath; @@ -165,6 +166,7 @@ bool ltoCSProfileGenerate; bool ltoDebugPassManager; bool ltoNewPassManager; + bool ltoUniqueBBSectionNames; bool ltoWholeProgramVisibility; bool mergeArmExidx; bool mipsN32Abi = false; @@ -175,6 +177,7 @@ bool nostdlib; bool oFormatBinary; bool omagic; + bool optimizeBBJumps; bool optRemarksWithHotness; bool picThunk; bool pie; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -878,6 +878,8 @@ config->cref = args.hasFlag(OPT_cref, OPT_no_cref, false); config->defineCommon = args.hasFlag(OPT_define_common, OPT_no_define_common, !args.hasArg(OPT_relocatable)); + config->optimizeBBJumps = + args.hasFlag(OPT_optimize_bb_jumps, OPT_no_optimize_bb_jumps, false); config->demangle = args.hasFlag(OPT_demangle, OPT_no_demangle, true); config->dependentLibraries = args.hasFlag(OPT_dependent_libraries, OPT_no_dependent_libraries, true); config->disableVerify = args.hasArg(OPT_disable_verify); @@ -924,6 +926,11 @@ config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq); config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1); config->ltoSampleProfile = args.getLastArgValue(OPT_lto_sample_profile); + config->ltoBasicBlockSections = + args.getLastArgValue(OPT_lto_basicblock_sections); + config->ltoUniqueBBSectionNames = + args.hasFlag(OPT_lto_unique_bb_section_names, + OPT_no_lto_unique_bb_section_names, false); config->mapFile = args.getLastArgValue(OPT_Map); config->mipsGotSize = args::getInteger(args, OPT_mips_got_size, 0xfff0); config->mergeArmExidx = diff --git a/lld/ELF/InputSection.h 
b/lld/ELF/InputSection.h --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -128,6 +128,26 @@ return cast_or_null>(file); } + // If basic block sections are enabled, many code sections could end up with + // one or two jump instructions at the end that could be relaxed to a smaller + // instruction. The members below help trimming the trailing jump instruction + // and shrinking a section. + unsigned bytesDropped = 0; + + void drop_back(uint64_t num) { bytesDropped += num; } + + void push_back(uint64_t num) { + assert(bytesDropped >= num); + bytesDropped -= num; + } + + void trim() { + if (bytesDropped) { + rawData = rawData.drop_back(bytesDropped); + bytesDropped = 0; + } + } + ArrayRef data() const { if (uncompressedSize >= 0) uncompress(); @@ -183,12 +203,25 @@ // the mmap'ed output buffer. template void relocate(uint8_t *buf, uint8_t *bufEnd); void relocateAlloc(uint8_t *buf, uint8_t *bufEnd); + static uint64_t getRelocTargetVA(const InputFile *File, RelType Type, + int64_t A, uint64_t P, const Symbol &Sym, + RelExpr Expr); // The native ELF reloc data type is not very convenient to handle. // So we convert ELF reloc records to our own records in Relocations.cpp. // This vector contains such "cooked" relocations. std::vector relocations; + // Indicates that this section needs to be padded with a NOP filler if set to + // true. + bool nopFiller = false; + + // These are modifiers to jump instructions that are necessary when basic + // block sections are enabled. Basic block sections creates opportunities to + // relax jump instructions at basic block boundaries after reordering the + // basic blocks. + std::vector jumpInstrMods; + // A function compiled with -fsplit-stack calling a function // compiled without -fsplit-stack needs its prologue adjusted. Find // such functions and adjust their prologues. 
This is very similar diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -138,7 +138,7 @@ return s->getSize(); if (uncompressedSize >= 0) return uncompressedSize; - return rawData.size(); + return rawData.size() - bytesDropped; } void InputSectionBase::uncompress() const { @@ -659,8 +659,9 @@ } } -static uint64_t getRelocTargetVA(const InputFile *file, RelType type, int64_t a, - uint64_t p, const Symbol &sym, RelExpr expr) { +uint64_t InputSectionBase::getRelocTargetVA(const InputFile *file, RelType type, + int64_t a, uint64_t p, + const Symbol &sym, RelExpr expr) { switch (expr) { case R_ABS: case R_DTPREL: @@ -871,6 +872,12 @@ if (expr == R_NONE) continue; + if (expr == R_SIZE) { + target->relocateNoSym(bufLoc, type, + SignExtend64(sym.getSize() + addend)); + continue; + } + if (expr != R_ABS && expr != R_DTPREL && expr != R_RISCV_ADD) { std::string msg = getLocation(offset) + ": has non-ABS relocation " + toString(type) + @@ -942,6 +949,8 @@ const unsigned bits = config->wordsize * 8; for (const Relocation &rel : relocations) { + if (rel.expr == R_NONE) + continue; uint64_t offset = rel.offset; if (auto *sec = dyn_cast(this)) offset += sec->outSecOff; @@ -1011,6 +1020,18 @@ break; } } + + // Apply jumpInstrMods. jumpInstrMods are created when the opcode of + // a jmp insn must be modified to shrink the jmp insn or to flip the jmp + // insn. This is primarily used to relax and optimize jumps created with + // basic block sections. 
+ if (auto *sec = dyn_cast(this)) { + for (const JumpInstrMod &jumpMod : jumpInstrMods) { + uint64_t offset = jumpMod.offset + sec->outSecOff; + uint8_t *bufLoc = buf + offset; + target->applyJumpInstrMod(bufLoc, jumpMod.original, jumpMod.size); + } + } } // For each function-defining prologue, find any calls to __morestack, diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -76,6 +76,32 @@ c.Options.FunctionSections = true; c.Options.DataSections = true; + // Check if basic block sections must be used. + // Allowed values for --lto-basicblock-sections are "all", "labels", + // "", or none. This is the equivalent + // of -fbasicblock-sections= flag in clang. + if (!config->ltoBasicBlockSections.empty()) { + if (config->ltoBasicBlockSections == "all") { + c.Options.BBSections = BasicBlockSection::All; + } else if (config->ltoBasicBlockSections == "labels") { + c.Options.BBSections = BasicBlockSection::Labels; + } else if (config->ltoBasicBlockSections == "none") { + c.Options.BBSections = BasicBlockSection::None; + } else { + ErrorOr> MBOrErr = + MemoryBuffer::getFile(config->ltoBasicBlockSections.str()); + if (!MBOrErr) { + error("cannot open " + config->ltoBasicBlockSections + ":" + + MBOrErr.getError().message()); + } else { + c.Options.BBSectionsFuncListBuf = std::move(*MBOrErr); + } + c.Options.BBSections = BasicBlockSection::List; + } + } + + c.Options.UniqueBBSectionNames = config->ltoUniqueBBSectionNames; + if (auto relocModel = getRelocModelFromCMModel()) c.RelocModel = *relocModel; else if (config->relocatable) diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -42,6 +42,10 @@ defm defsym: Eq<"defsym", "Define a symbol alias">, MetaVarName<"=">; +defm optimize_bb_jumps: B<"optimize-bb-jumps", + "Remove direct jumps at the end to the next basic block", + "Do not remove any direct jumps at the end to the next basic block (default)">; + defm 
split_stack_adjust_size : Eq<"split-stack-adjust-size", "Specify adjustment to stack size when a split-stack function calls a " @@ -502,6 +506,11 @@ HelpText<"The format used for serializing remarks (default: YAML)">; defm plugin_opt: Eq<"plugin-opt", "specifies LTO options for compatibility with GNU linkers">; def save_temps: F<"save-temps">; +def lto_basicblock_sections: J<"lto-basicblock-sections=">, + HelpText<"Enable basic block sections for LTO">; +defm lto_unique_bb_section_names: B<"lto-unique-bb-section-names", + "Give unique names to every basic block section for LTO", + "Do not give unique names to every basic block section for LTO (default)">; def shuffle_sections: J<"shuffle-sections=">, MetaVarName<"">, HelpText<"Shuffle input sections using the given seed. If 0, use a random seed">; def thinlto_cache_dir: J<"thinlto-cache-dir=">, diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -242,6 +242,25 @@ sortByOrder(isd->sections, order); }
+// Fill [buf, buf + size) with the target's NOPs: as many copies of the
+// longest one as fit, then one NOP whose length equals the leftover bytes.
+static void nopInstrFill(uint8_t *buf, size_t size) {
+  if (size == 0)
+    return;
+  const auto &nopFiller = *target->nopInstrs;
+  const auto &longest = nopFiller.back();
+  const size_t remaining = size % longest.size();
+  uint8_t *const tail = buf + (size - remaining);
+  for (; buf != tail; buf += longest.size())
+    memcpy(buf, longest.data(), longest.size());
+  if (!remaining)
+    return;
+  // nopFiller[k] is expected to hold the (k + 1)-byte NOP.
+  assert(nopFiller[remaining - 1].size() == remaining);
+  memcpy(tail, nopFiller[remaining - 1].data(), remaining);
+}
+
 // Fill [Buf, Buf + Size) with Filler. // This is used for linker script "=fillexp" command.
static void fill(uint8_t *buf, size_t size, @@ -330,7 +349,11 @@ end = buf + size; else end = buf + sections[i + 1]->outSecOff; - fill(start, end - start, filler); + if (isec->nopFiller) { + assert(target->nopInstrs); + nopInstrFill(start, end - start); + } else + fill(start, end - start, filler); } }); diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h --- a/lld/ELF/Relocations.h +++ b/lld/ELF/Relocations.h @@ -24,6 +24,7 @@ // Represents a relocation type, such as R_X86_64_PC32 or R_ARM_THM_CALL. using RelType = uint32_t; +using JumpModType = uint32_t; // List of target-independent relocation types. Relocations read // from files are converted to these types so that the main code @@ -108,6 +109,15 @@ Symbol *sym; };
+// Manipulate jump instructions with these modifiers. These are used to relax
+// jump instruction opcodes at basic block boundaries and are particularly
+// useful when basic block sections are enabled.
+struct JumpInstrMod {
+  JumpModType original; // Jump type handed to TargetInfo::applyJumpInstrMod.
+  uint64_t offset;      // Section offset of the opcode byte to rewrite.
+  unsigned size;        // Passed to applyJumpInstrMod to pick the encoding.
+};
+
 // This function writes undefined symbol diagnostics to an internal buffer. // Call reportUndefinedSymbols() after calling scanRelocations() to emit // the diagnostics. diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -88,8 +88,21 @@ relocate(loc, Relocation{R_NONE, type, 0, 0, nullptr}, val); }
+  virtual void applyJumpInstrMod(uint8_t *loc, JumpModType type,
+                                 unsigned size) const {}
+
 virtual ~TargetInfo();
+  // This deletes a jump insn at the end of the section if it is a fall thru to
+  // the next section. Further, if there is a conditional jump and a direct
+  // jump consecutively, it tries to flip the conditional jump to convert the
+  // direct jump into a fall thru and delete it. Returns true if a jump
+  // instruction can be deleted.
+ virtual bool deleteFallThruJmpInsn(InputSection &is, InputFile *file, + InputSection *nextIS) const { + return false; + } + unsigned defaultCommonPageSize = 4096; unsigned defaultMaxPageSize = 4096; @@ -126,6 +139,10 @@ // executable OutputSections. std::array trapInstr; + // Stores the NOP instructions of different sizes for the target and is used + // to pad sections that are relaxed. + llvm::Optional>> nopInstrs; + // If a target needs to rewrite calls to __morestack to instead call // __morestack_non_split when a split-stack enabled caller calls a // non-split-stack callee this will return true. Otherwise returns false. diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -31,6 +31,8 @@ #include "llvm/Support/xxhash.h" #include +#define DEBUG_TYPE "lld" + using namespace llvm; using namespace llvm::ELF; using namespace llvm::object; @@ -57,6 +59,7 @@ void sortSections(); void resolveShfLinkOrder(); void finalizeAddressDependentContent(); + void optimizeBasicBlockJumps(); void sortInputSections(); void finalizeSections(); void checkExecuteOnly(); @@ -1670,6 +1673,94 @@ Twine(os->alignment) + ")"); } +// If Input Sections have been shrunk (basic block sections) then +// update symbol values and sizes associated with these sections. With basic +// block sections, input sections can shrink when the jump instructions at +// the end of the section are relaxed. 
+static void fixSymbolsAfterShrinking() { + for (InputFile *File : objectFiles) { + parallelForEach(File->getSymbols(), [&](Symbol *Sym) { + auto *def = dyn_cast(Sym); + if (!def) + return; + + const SectionBase *sec = def->section; + if (!sec) + return; + + const InputSectionBase *inputSec = dyn_cast(sec->repl); + if (!inputSec || !inputSec->bytesDropped) + return; + + const size_t OldSize = inputSec->data().size(); + const size_t NewSize = OldSize - inputSec->bytesDropped; + + if (def->value > NewSize && def->value <= OldSize) { + LLVM_DEBUG(llvm::dbgs() + << "Moving symbol " << Sym->getName() << " from " + << def->value << " to " + << def->value - inputSec->bytesDropped << " bytes\n"); + def->value -= inputSec->bytesDropped; + return; + } + + if (def->value + def->size > NewSize && def->value <= OldSize && + def->value + def->size <= OldSize) { + LLVM_DEBUG(llvm::dbgs() + << "Shrinking symbol " << Sym->getName() << " from " + << def->size << " to " << def->size - inputSec->bytesDropped + << " bytes\n"); + def->size -= inputSec->bytesDropped; + } + }); + } +} + +// If basic block sections exist, there are opportunities to delete fall thru +// jumps and shrink jump instructions after basic block reordering. This +// relaxation pass does that. It is only enabled when --optimize-bb-jumps +// option is used. +template void Writer::optimizeBasicBlockJumps() { + assert(config->optimizeBBJumps); + + script->assignAddresses(); + // For every output section that has executable input sections, this + // does the following: + // 1. Deletes all direct jump instructions in input sections that + // jump to the following section as it is not required. + // 2. If there are two consecutive jump instructions, it checks + // if they can be flipped and one can be deleted. 
+ for (OutputSection *os : outputSections) { + if (!(os->flags & SHF_EXECINSTR)) + continue; + std::vector sections = getInputSections(os); + std::vector result(sections.size()); + // Delete all fall through jump instructions. Also, check if two + // consecutive jump instructions can be flipped so that a fall + // through jmp instruction can be deleted. + parallelForEachN(0, sections.size(), [&](size_t i) { + InputSection *next = i + 1 < sections.size() ? sections[i + 1] : nullptr; + InputSection &is = *sections[i]; + result[i] = + target->deleteFallThruJmpInsn(is, is.getFile(), next) ? 1 : 0; + }); + size_t numDeleted = std::count(result.begin(), result.end(), 1); + if (numDeleted > 0) { + script->assignAddresses(); + LLVM_DEBUG(llvm::dbgs() + << "Removing " << numDeleted << " fall through jumps\n"); + } + } + + fixSymbolsAfterShrinking(); + + for (OutputSection *os : outputSections) { + std::vector sections = getInputSections(os); + for (InputSection *is : sections) + is->trim(); + } +} + static void finalizeSynthetic(SyntheticSection *sec) { if (sec && sec->isNeeded() && sec->getParent()) sec->finalizeContents(); @@ -1992,6 +2083,12 @@ finalizeSynthetic(in.symTab); finalizeSynthetic(in.ppc64LongBranchTarget); + // Relaxation to delete inter-basic block jumps created by basic block + // sections. Run after in.symTab is finalized as optimizeBasicBlockJumps + // can relax jump instructions based on symbol offset. + if (config->optimizeBBJumps) + optimizeBasicBlockJumps(); + // Fill other section headers. The dynamic table is finalized // at the end because some tags like RELSZ depend on result // of finalizing other sections. diff --git a/lld/test/ELF/bb-sections-and-icf.s b/lld/test/ELF/bb-sections-and-icf.s new file mode 100644 --- /dev/null +++ b/lld/test/ELF/bb-sections-and-icf.s @@ -0,0 +1,47 @@ +# REQUIRES: x86 +## basicblock-sections tests. +## This simple test checks foo is folded into bar with bb sections +## and the jumps are deleted. 
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: ld.lld --optimize-bb-jumps --icf=all %t.o -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s + +# CHECK: : +# CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: je 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp + +# CHECK: : +## Explicitly check that bar is folded and not emitted. +# CHECK-NOT: : +# CHECK-NOT: : +# CHECK-NOT: : + +.section .text.bar,"ax",@progbits +.type bar,@function +bar: + nopl (%rax) + jne a.BB.bar + jmp aa.BB.bar + +.section .text.a.BB.bar,"ax",@progbits,unique,3 +a.BB.bar: + nopl (%rax) + +aa.BB.bar: + ret + +.section .text.foo,"ax",@progbits +.type foo,@function +foo: + nopl (%rax) + jne a.BB.foo + jmp aa.BB.foo + +.section .text.a.BB.foo,"ax",@progbits,unique,2 +a.BB.foo: + nopl (%rax) + +aa.BB.foo: + ret diff --git a/lld/test/ELF/bb-sections-delete-fallthru.s b/lld/test/ELF/bb-sections-delete-fallthru.s new file mode 100644 --- /dev/null +++ b/lld/test/ELF/bb-sections-delete-fallthru.s @@ -0,0 +1,128 @@ +# REQUIRES: x86 +## basicblock-sections tests. +## This simple test checks if redundant direct jumps are converted to +## implicit fallthrus. The jcc's must be converted to their inverted +## opcode, for instance jne to je and jmp must be deleted. 
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: ld.lld --optimize-bb-jumps %t.o -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s + +# CHECK: : +# CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: jne 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp + + +.section .text,"ax",@progbits +.type foo,@function +foo: + nopl (%rax) + je a.BB.foo + jmp r.BB.foo + +# CHECK: : +# CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: je 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp + +.section .text,"ax",@progbits,unique,3 +a.BB.foo: + nopl (%rax) + jne aa.BB.foo + jmp r.BB.foo + +# CHECK: : +# CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: jle 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp +# +.section .text,"ax",@progbits,unique,4 +aa.BB.foo: + nopl (%rax) + jg aaa.BB.foo + jmp r.BB.foo + +# CHECK: : +# CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: jl 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp +# +.section .text,"ax",@progbits,unique,5 +aaa.BB.foo: + nopl (%rax) + jge aaaa.BB.foo + jmp r.BB.foo + +# CHECK: : +# CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: jae 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp +# +.section .text,"ax",@progbits,unique,6 +aaaa.BB.foo: + nopl (%rax) + jb aaaaa.BB.foo + jmp r.BB.foo + +# CHECK: : +# CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: ja 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp +# +.section .text,"ax",@progbits,unique,7 +aaaaa.BB.foo: + nopl (%rax) + jbe aaaaaa.BB.foo + jmp r.BB.foo + +# CHECK: : +# CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: jge 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp +# +.section .text,"ax",@progbits,unique,8 +aaaaaa.BB.foo: + nopl (%rax) + jl aaaaaaa.BB.foo + jmp r.BB.foo + +# CHECK: : +# CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: jg 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp +# +.section .text,"ax",@progbits,unique,9 +aaaaaaa.BB.foo: + nopl (%rax) + jle aaaaaaaa.BB.foo + jmp r.BB.foo + +# CHECK: : +# CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: jbe 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp +# +.section .text,"ax",@progbits,unique,10 +aaaaaaaa.BB.foo: + nopl (%rax) + ja aaaaaaaaa.BB.foo + jmp r.BB.foo + +# CHECK: : +# 
CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: jb 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp +# +.section .text,"ax",@progbits,unique,11 +aaaaaaaaa.BB.foo: + nopl (%rax) + jae aaaaaaaaaa.BB.foo + jmp r.BB.foo + +.section .text,"ax",@progbits,unique,20 +aaaaaaaaaa.BB.foo: + nopl (%rax) + +r.BB.foo: + ret diff --git a/lld/test/ELF/bb-sections-pc32reloc.s b/lld/test/ELF/bb-sections-pc32reloc.s new file mode 100644 --- /dev/null +++ b/lld/test/ELF/bb-sections-pc32reloc.s @@ -0,0 +1,37 @@ +# REQUIRES: x86 +## basicblock-sections tests. +## This simple test checks if redundant direct jumps are converted to +## implicit fallthrus when PC32 reloc is present. The jcc's must be converted +## to their inverted opcode, for instance jne to je and jmp must be deleted. + +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: llvm-objdump -dr %t.o| FileCheck %s --check-prefix=RELOC +# RUN: ld.lld --optimize-bb-jumps %t.o -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s + +# RELOC: jmp +# RELOC-NEXT: R_X86_64_PC32 + +# CHECK: : +# CHECK-NEXT: nopl (%rax) +# CHECK-NEXT: jne 0x{{[[:xdigit:]]+}} +# CHECK-NOT: jmp + + +.section .text,"ax",@progbits +.type foo,@function +foo: + nopl (%rax) + je a.BB.foo +# Encode a jmp r.BB.foo insn using a PC32 reloc + .byte 0xe9 + .long r.BB.foo - . - 4 + +# CHECK: : +# CHECK-NEXT: nopl (%rax) + +.section .text,"ax",@progbits,unique,3 +a.BB.foo: + nopl (%rax) +r.BB.foo: + ret