Index: llvm/include/llvm/MC/MCAsmBackend.h =================================================================== --- llvm/include/llvm/MC/MCAsmBackend.h +++ llvm/include/llvm/MC/MCAsmBackend.h @@ -144,6 +144,15 @@ virtual bool mayNeedRelaxation(const MCInst &Inst, const MCSubtargetInfo &STI) const = 0; + /// Return true if the given instruction can be further relaxed. This is + /// different than mayNeedRelaxation in that it is allowed to return + /// true when relaxation is possible, but not required. + virtual bool canBeFurtherRelaxed(const MCInst &Inst, + const MCSubtargetInfo &STI) const { + return mayNeedRelaxation(Inst, STI); + } + + /// Target specific predicate for whether a given fixup requires the /// associated instruction to be relaxed. virtual bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved, Index: llvm/include/llvm/MC/MCAssembler.h =================================================================== --- llvm/include/llvm/MC/MCAssembler.h +++ llvm/include/llvm/MC/MCAssembler.h @@ -190,6 +190,9 @@ /// if any offsets were adjusted. bool layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec); + /// Perform relaxation on a single fragment - returns true if the fragment + /// changes as a result of relaxation. + bool relaxFragment(MCAsmLayout &Layout, MCFragment &F); bool relaxInstruction(MCAsmLayout &Layout, MCRelaxableFragment &IF); bool relaxLEB(MCAsmLayout &Layout, MCLEBFragment &IF); bool relaxBoundaryAlign(MCAsmLayout &Layout, MCBoundaryAlignFragment &BF); @@ -200,6 +203,10 @@ MCCVInlineLineTableFragment &DF); bool relaxCVDefRange(MCAsmLayout &Layout, MCCVDefRangeFragment &DF); + /// Once relaxation is complete, try to reduce number of nops required + /// without requiring any further relaxation. + void optimizeLayout(MCAsmLayout &Layout); + /// finishLayout - Finalize a layout, including fragment lowering. 
void finishLayout(MCAsmLayout &Layout); Index: llvm/include/llvm/MC/MCFragment.h =================================================================== --- llvm/include/llvm/MC/MCFragment.h +++ llvm/include/llvm/MC/MCFragment.h @@ -274,6 +274,9 @@ } }; +/// Represents alignment for either code or data. For code, the Value and +/// ValueSize fields are ignored; nops are always emitted. For data, the +/// Value is repeated ValueSize times. Value is usually 0. class MCAlignFragment : public MCFragment { /// The alignment to ensure, in bytes. unsigned Alignment; Index: llvm/lib/MC/MCAssembler.cpp =================================================================== --- llvm/lib/MC/MCAssembler.cpp +++ llvm/lib/MC/MCAssembler.cpp @@ -791,6 +791,12 @@ errs() << "assembler backend - post-relaxation\n--\n"; dump(); }); + optimizeLayout(Layout); + + DEBUG_WITH_TYPE("mc-dump", { + errs() << "assembler backend - post-optimization\n--\n"; + dump(); }); + // Finalize the layout, including fragment lowering. finishLayout(Layout); @@ -1014,11 +1020,8 @@ uint64_t NewSize = needPadding(AlignedOffset, AlignedSize, BoundaryAlignment) ? 
offsetToAlignment(AlignedOffset, BoundaryAlignment) : 0U; - if (NewSize == OldSize) - return false; BF.setSize(NewSize); - Layout.invalidateFragmentsFrom(&BF); - return true; + return (NewSize != OldSize); } bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout, @@ -1106,6 +1109,30 @@ return OldSize != F.getContents().size(); } +bool MCAssembler::relaxFragment(MCAsmLayout &Layout, MCFragment &F) { + switch(F.getKind()) { + default: + return false; + case MCFragment::FT_Relaxable: + assert(!getRelaxAll() && + "Did not expect a MCRelaxableFragment in RelaxAll mode"); + return relaxInstruction(Layout, cast<MCRelaxableFragment>(F)); + case MCFragment::FT_Dwarf: + return relaxDwarfLineAddr(Layout, cast<MCDwarfLineAddrFragment>(F)); + case MCFragment::FT_DwarfFrame: + return relaxDwarfCallFrameFragment(Layout, + cast<MCDwarfCallFrameFragment>(F)); + case MCFragment::FT_LEB: + return relaxLEB(Layout, cast<MCLEBFragment>(F)); + case MCFragment::FT_BoundaryAlign: + return relaxBoundaryAlign(Layout, cast<MCBoundaryAlignFragment>(F)); + case MCFragment::FT_CVInlineLines: + return relaxCVInlineLineTable(Layout, cast<MCCVInlineLineTableFragment>(F)); + case MCFragment::FT_CVDefRange: + return relaxCVDefRange(Layout, cast<MCCVDefRangeFragment>(F)); + } +} + bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec) { // Holds the first fragment which needed relaxing during this layout. It will // remain NULL if none were relaxed. @@ -1116,39 +1143,7 @@ // Attempt to relax all the fragments in the section. for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) { // Check if this is a fragment that needs relaxation. 
- bool RelaxedFrag = false; - switch(I->getKind()) { - default: - break; - case MCFragment::FT_Relaxable: - assert(!getRelaxAll() && - "Did not expect a MCRelaxableFragment in RelaxAll mode"); - RelaxedFrag = relaxInstruction(Layout, *cast<MCRelaxableFragment>(I)); - break; - case MCFragment::FT_Dwarf: - RelaxedFrag = relaxDwarfLineAddr(Layout, - *cast<MCDwarfLineAddrFragment>(I)); - break; - case MCFragment::FT_DwarfFrame: - RelaxedFrag = - relaxDwarfCallFrameFragment(Layout, - *cast<MCDwarfCallFrameFragment>(I)); - break; - case MCFragment::FT_LEB: - RelaxedFrag = relaxLEB(Layout, *cast<MCLEBFragment>(I)); - break; - case MCFragment::FT_BoundaryAlign: - RelaxedFrag = - relaxBoundaryAlign(Layout, *cast<MCBoundaryAlignFragment>(I)); - break; - case MCFragment::FT_CVInlineLines: - RelaxedFrag = - relaxCVInlineLineTable(Layout, *cast<MCCVInlineLineTableFragment>(I)); - break; - case MCFragment::FT_CVDefRange: - RelaxedFrag = relaxCVDefRange(Layout, *cast<MCCVDefRangeFragment>(I)); - break; - } + bool RelaxedFrag = relaxFragment(Layout, *I); if (RelaxedFrag && !FirstRelaxedFragment) FirstRelaxedFragment = &*I; } @@ -1172,6 +1167,125 @@ return WasRelaxed; } +void MCAssembler::optimizeLayout(MCAsmLayout &Layout) { + // See if we can further relax some instructions to cut down on the number of + // nop bytes required for code alignment. The actual win is in reducing + // instruction count, not number of bytes. Some micro-architectures (such + // as, say, modern X86-64) can easily end up decode limited. It is + // often better to reduce the number of instructions (i.e. eliminate nops) + // even at the cost of increasing the size and complexity of others. 
+ + DenseSet<MCFragment *> LabeledFragments; + for (const MCSymbol &S : symbols()) + LabeledFragments.insert(S.getFragment(false)); + + for (iterator it = begin(), ie = end(); it != ie; ++it) { + MCSection &Sec = *it; + if (!Sec.getKind().isText()) + continue; + + SmallVector<MCRelaxableFragment *, 4> Relaxable; + for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) { + MCFragment &F = *I; + + if (LabeledFragments.count(&F)) + Relaxable.clear(); + + if (F.getKind() == MCFragment::FT_Data) + // Skip and ignore + continue; + + if (F.getKind() == MCFragment::FT_Relaxable) { + auto &RF = cast<MCRelaxableFragment>(*I); + if (getBackend().canBeFurtherRelaxed(RF.getInst(), + *RF.getSubtargetInfo())) + Relaxable.push_back(&RF); + // Note: It's okay to skip an instruction which can't be further + // relaxed as it a) will be a fixed number of bytes, and b) must be + // able to encode any larger offsets which result from shifting it + // around. + continue; + } + + // For any unhandled kind, assume we can't change layout. + if (F.getKind() != MCFragment::FT_Align) { + Relaxable.clear(); + continue; + } + const MCAlignFragment &AF = cast<MCAlignFragment>(F); + + const unsigned OrigOffset = Layout.getFragmentOffset(&F); + const unsigned OrigSize = computeFragmentSize(Layout, F); + + // unsigned Offset = Layout.getFragmentOffset(&AF); + unsigned Size = computeFragmentSize(Layout, AF); + // unsigned Size = offsetToAlignment(Offset, Align(AF.getAlignment())); + if (Size == 0) + continue; + if (Relaxable.empty()) + continue; + // To keep the effects local, prefer to relax instructions closest to + // the align directive. This is purely about human understandability + // of the resulting code. If we later find a reason to expand + // particular instructions over others, we can adjust. 
+ MCFragment *FirstChangedFragment = nullptr; + while (!Relaxable.empty()) { + auto *RF = Relaxable.pop_back_val(); + + MCInst Relaxed; + getBackend().relaxInstruction(RF->getInst(), *RF->getSubtargetInfo(), + Relaxed); + SmallVector<MCFixup, 4> Fixups; + SmallString<256> Code; + raw_svector_ostream VecOS(Code); + getEmitter().encodeInstruction(Relaxed, VecOS, Fixups, + *RF->getSubtargetInfo()); + unsigned OldSize = RF->getContents().size(); + unsigned NewSize = Code.size(); + assert(NewSize >= OldSize && "size decrease during relaxation?"); + unsigned Delta = NewSize - OldSize; + if (Delta > Size) { + // Too big, can't use this relaxation without requiring another + // round of relaxation. Subtly, we can't skip past this one either + // as we *haven't* relaxed it. Changing its starting offset might + // require a larger negative offset than it can encode. We don't + // need to worry about larger positive offsets as none of the + // possible offsets between this and our align are visible, and the + // ones afterwards aren't changing. + break; + } + //dbgs() << "Increased padding by " << Delta << "\n"; + Size -= Delta; + RF->setInst(Relaxed); + RF->getContents() = Code; + RF->getFixups() = Fixups; + FirstChangedFragment = RF; + if (Size == 0) + break; + } + Relaxable.clear(); + if (FirstChangedFragment) { + // Redo the layout for any fragments in the affected range. This is + // mostly updating offsets, but also may need to apply other updates + // (such as changing offsets) to the fragments in question. Note that + // the relaxation itself has already been done above, and thus the + // total size of the range isn't changing. 
+ Layout.invalidateFragmentsFrom(FirstChangedFragment); + while (FirstChangedFragment != &F) { + relaxFragment(Layout, *FirstChangedFragment); + FirstChangedFragment = FirstChangedFragment->getNextNode(); + } + } + const unsigned FinalOffset = Layout.getFragmentOffset(&F); + const unsigned FinalSize = computeFragmentSize(Layout, F); + assert(OrigOffset + OrigSize == FinalOffset + FinalSize && + "can't move start of next fragment!"); + + } + } +} + + void MCAssembler::finishLayout(MCAsmLayout &Layout) { assert(getBackendPtr() && "Expected assembler backend"); // The layout is done. Mark every fragment as valid. Index: llvm/lib/MC/MCFragment.cpp =================================================================== --- llvm/lib/MC/MCFragment.cpp +++ llvm/lib/MC/MCFragment.cpp @@ -394,6 +394,7 @@ OS << "\n "; OS << " Inst:"; F->getInst().dump_pretty(OS); + OS << " (" << F->getContents().size() << ")"; break; } case MCFragment::FT_Org: { Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp =================================================================== --- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -168,6 +168,9 @@ bool mayNeedRelaxation(const MCInst &Inst, const MCSubtargetInfo &STI) const override; + bool canBeFurtherRelaxed(const MCInst &Inst, + const MCSubtargetInfo &STI) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; @@ -603,6 +606,15 @@ return false; } +bool X86AsmBackend::canBeFurtherRelaxed(const MCInst &Inst, + const MCSubtargetInfo &STI) const { + // TODO: There are lots of other tricks we could apply for increasing + // encoding size without impacting performance. 
+ bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + return getRelaxedOpcode(Inst, is16BitMode) != Inst.getOpcode(); +} + + bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, @@ -616,6 +628,8 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, MCInst &Res) const { + assert(canBeFurtherRelaxed(Inst, STI)); + // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; unsigned RelaxedOp = getRelaxedOpcode(Inst, is16BitMode); Index: llvm/test/MC/X86/align-via-relaxation.s =================================================================== --- /dev/null +++ llvm/test/MC/X86/align-via-relaxation.s @@ -0,0 +1,74 @@ + # RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s | llvm-objdump -d --section=.text - | FileCheck %s + + + .file "test.c" + .text + .section .text + # Demonstrate that we can relax instructions to provide padding, not + # just insert nops. jmps are being used for ease of demonstration. 
+ # CHECK: .text + # CHECK: 0: eb 1f jmp 31 + # CHECK: 2: e9 1a 00 00 00 jmp 26 + # CHECK: 7: e9 15 00 00 00 jmp 21 + # CHECK: c: e9 10 00 00 00 jmp 16 + # CHECK: 11: e9 0b 00 00 00 jmp 11 + # CHECK: 16: e9 06 00 00 00 jmp 6 + # CHECK: 1b: e9 01 00 00 00 jmp 1 + # CHECK: 20: cc int3 + .p2align 4 + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + .p2align 5 + int3 +foo: + ret + + # Check that we're not shifting around the offsets of labels - doing + # that would require a further round of relaxation + # CHECK: bar: + # CHECK: 22: eb fe jmp -2 + # CHECK: 24: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax) + # CHECK: 2e: 66 90 nop + # CHECK: 30: 0f 0b ud2 + +bar: + jmp bar +nobypass: + .p2align 4 + ud2 + + + # Canonical toy loop to show benefit - we can align the loop header with + # fewer nops by relaxing the branch, even though we don't need to + # CHECK: loop_preheader: + # CHECK: 45: 48 85 c0 testq %rax, %rax + # CHECK: 48: 0f 8e 22 00 00 00 jle 34 + # CHECK: 4e: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax) + # CHECK: 58: 0f 1f 84 00 00 00 00 00 nopl (%rax,%rax) + # CHECK: loop_header: + # CHECK: 60: 48 29 04 25 01 00 00 00 subq %rax, 1 + # CHECK: 68: 48 85 c0 testq %rax, %rax + # CHECK: 6b: 7e 03 jle 3 + # CHECK: 6d: eb f1 jmp -15 + # CHECK: 6f: 90 nop + # CHECK: loop_exit: + # CHECK: 70: c3 retq + .p2align 5 + .skip 5 +loop_preheader: + testq %rax, %rax + jle loop_exit + .p2align 5 +loop_header: + subq %rax, 1 + testq %rax, %rax + jle loop_exit + jmp loop_header + .p2align 4 +loop_exit: + ret