Index: llvm/include/llvm/MC/MCAsmBackend.h =================================================================== --- llvm/include/llvm/MC/MCAsmBackend.h +++ llvm/include/llvm/MC/MCAsmBackend.h @@ -144,6 +144,15 @@ virtual bool mayNeedRelaxation(const MCInst &Inst, const MCSubtargetInfo &STI) const = 0; + /// Return true if the given instruction can be further relaxed. This is + /// different than mayNeedRelaxation in that it is allowed to return + /// true when relaxation is possible, but not required. + virtual bool canBeFurtherRelaxed(const MCInst &Inst, + const MCSubtargetInfo &STI) const { + return mayNeedRelaxation(Inst, STI); + } + + /// Target specific predicate for whether a given fixup requires the /// associated instruction to be relaxed. virtual bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved, Index: llvm/include/llvm/MC/MCAssembler.h =================================================================== --- llvm/include/llvm/MC/MCAssembler.h +++ llvm/include/llvm/MC/MCAssembler.h @@ -203,6 +203,10 @@ MCCVInlineLineTableFragment &DF); bool relaxCVDefRange(MCAsmLayout &Layout, MCCVDefRangeFragment &DF); + /// Once relaxation is complete, try to reduce number of nops required + /// without requiring any further relaxation. + void optimizeLayout(MCAsmLayout &Layout); + /// finishLayout - Finalize a layout, including fragment lowering. void finishLayout(MCAsmLayout &Layout); Index: llvm/include/llvm/MC/MCFragment.h =================================================================== --- llvm/include/llvm/MC/MCFragment.h +++ llvm/include/llvm/MC/MCFragment.h @@ -274,6 +274,9 @@ } }; +/// Represents alignment for either code or data. For code, the Value and +/// ValueSize fields are ignored; nops are always emitted. For data, the +/// Value is repeated ValueSize times. Value is usually 0. class MCAlignFragment : public MCFragment { /// The alignment to ensure, in bytes. 
unsigned Alignment; Index: llvm/lib/MC/MCAssembler.cpp =================================================================== --- llvm/lib/MC/MCAssembler.cpp +++ llvm/lib/MC/MCAssembler.cpp @@ -791,6 +791,12 @@ errs() << "assembler backend - post-relaxation\n--\n"; dump(); }); + optimizeLayout(Layout); + + DEBUG_WITH_TYPE("mc-dump", { + errs() << "assembler backend - post-optimization\n--\n"; + dump(); }); + // Finalize the layout, including fragment lowering. finishLayout(Layout); @@ -1014,11 +1020,8 @@ uint64_t NewSize = needPadding(AlignedOffset, AlignedSize, BoundaryAlignment) ? offsetToAlignment(AlignedOffset, BoundaryAlignment) : 0U; - if (NewSize == OldSize) - return false; BF.setSize(NewSize); - Layout.invalidateFragmentsFrom(&BF); - return true; + return (NewSize != OldSize); } bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout, @@ -1164,6 +1167,125 @@ return WasRelaxed; } +void MCAssembler::optimizeLayout(MCAsmLayout &Layout) { + // See if we can further relax some instructions to cut down on the number of + // nop bytes required for code alignment. The actual win is in reducing + // instruction count, not number of bytes. Some micro-architectures (such + // as, say, modern X86-64) can easily end up decode limited. It is + // often better to reduce the number of instructions (i.e. eliminate nops) + // even at the cost of increasing the size and complexity of others. 
+ + DenseSet<const MCFragment *> LabeledFragments; + for (const MCSymbol &S : symbols()) + LabeledFragments.insert(S.getFragment(false)); + + for (iterator it = begin(), ie = end(); it != ie; ++it) { + MCSection &Sec = *it; + if (!Sec.getKind().isText()) + continue; + + SmallVector<MCRelaxableFragment *, 4> Relaxable; + for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) { + MCFragment &F = *I; + + if (LabeledFragments.count(&F)) + Relaxable.clear(); + + if (F.getKind() == MCFragment::FT_Data) + // Skip and ignore + continue; + + if (F.getKind() == MCFragment::FT_Relaxable) { + auto &RF = cast<MCRelaxableFragment>(*I); + if (getBackend().canBeFurtherRelaxed(RF.getInst(), + *RF.getSubtargetInfo())) + Relaxable.push_back(&RF); + // Note: It's okay to skip an instruction which can't be further + // relaxed as it a) will be a fixed number of bytes, and b) must be + // able to encode any larger offsets which result from shifting it + // around. + continue; + } + + // For any unhandled kind, assume we can't change layout. + if (F.getKind() != MCFragment::FT_Align && + F.getKind() != MCFragment::FT_BoundaryAlign) { + Relaxable.clear(); + continue; + } + const unsigned OrigOffset = Layout.getFragmentOffset(&F); + const unsigned OrigSize = computeFragmentSize(Layout, F); + if (OrigSize == 0 || Relaxable.empty()) { + Relaxable.clear(); + continue; + } + + // To keep the effects local, prefer to relax instructions closest to + // the align directive. This is purely about human understandability + // of the resulting code. If we later find a reason to expand + // particular instructions over others, we can adjust. 
+ MCFragment *FirstChangedFragment = nullptr; + unsigned RemainingSize = OrigSize; + while (!Relaxable.empty() && RemainingSize != 0) { + auto *RF = Relaxable.pop_back_val(); + + MCInst Relaxed; + getBackend().relaxInstruction(RF->getInst(), *RF->getSubtargetInfo(), + Relaxed); + SmallVector<MCFixup, 4> Fixups; + SmallString<256> Code; + raw_svector_ostream VecOS(Code); + getEmitter().encodeInstruction(Relaxed, VecOS, Fixups, + *RF->getSubtargetInfo()); + const unsigned OldSize = RF->getContents().size(); + const unsigned NewSize = Code.size(); + assert(NewSize >= OldSize && "size decrease during relaxation?"); + unsigned Delta = NewSize - OldSize; + if (Delta > RemainingSize) { + // Too big, can't use this relaxation without requiring another + // round of relaxation. Subtly, we can't skip past this one either + // as we *haven't* relaxed it. Changing its starting offset might + // require a larger negative offset than it can encode. We don't + // need to worry about larger positive offsets as none of the + // possible offsets between this and our align are visible, and the + // ones afterwards aren't changing. + break; + } + RemainingSize -= Delta; + RF->setInst(Relaxed); + RF->getContents() = Code; + RF->getFixups() = Fixups; + FirstChangedFragment = RF; + } + Relaxable.clear(); + + // Unlike align, boundary align tracks its own size after relaxation. + if (F.getKind() == MCFragment::FT_BoundaryAlign) + cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize); + + if (FirstChangedFragment) { + // Redo the layout for any fragments in the affected range. This is + // mostly updating start offsets, but also may need to apply other + // updates (such as changing offsets) to the fragments in question. + // Note that the relaxation itself has already been done above, and + // thus the total size of the range isn't changing. 
+ Layout.invalidateFragmentsFrom(FirstChangedFragment); + while (FirstChangedFragment != &F) { + relaxFragment(Layout, *FirstChangedFragment); + FirstChangedFragment = FirstChangedFragment->getNextNode(); + } + } + + const unsigned FinalOffset = Layout.getFragmentOffset(&F); + const unsigned FinalSize = computeFragmentSize(Layout, F); + assert(OrigOffset + OrigSize == FinalOffset + FinalSize && + "can't move start of next fragment!"); + assert(FinalSize == RemainingSize && "inconsistent size computation?"); + } + } +} + + void MCAssembler::finishLayout(MCAsmLayout &Layout) { assert(getBackendPtr() && "Expected assembler backend"); // The layout is done. Mark every fragment as valid. Index: llvm/lib/MC/MCFragment.cpp =================================================================== --- llvm/lib/MC/MCFragment.cpp +++ llvm/lib/MC/MCFragment.cpp @@ -394,6 +394,7 @@ OS << "\n "; OS << " Inst:"; F->getInst().dump_pretty(OS); + OS << " (" << F->getContents().size() << ")"; break; } case MCFragment::FT_Org: { Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp =================================================================== --- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -168,6 +168,9 @@ bool mayNeedRelaxation(const MCInst &Inst, const MCSubtargetInfo &STI) const override; + bool canBeFurtherRelaxed(const MCInst &Inst, + const MCSubtargetInfo &STI) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; @@ -603,6 +606,15 @@ return false; } +bool X86AsmBackend::canBeFurtherRelaxed(const MCInst &Inst, + const MCSubtargetInfo &STI) const { + // TODO: There are lots of other tricks we could apply for increasing + // encoding size without impacting performance. 
+ bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + return getRelaxedOpcode(Inst, is16BitMode) != Inst.getOpcode(); +} + + bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, @@ -616,6 +628,8 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, MCInst &Res) const { + assert(canBeFurtherRelaxed(Inst, STI)); + // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; unsigned RelaxedOp = getRelaxedOpcode(Inst, is16BitMode); Index: llvm/test/MC/X86/align-branch-64.s =================================================================== --- llvm/test/MC/X86/align-branch-64.s +++ llvm/test/MC/X86/align-branch-64.s @@ -103,6 +103,24 @@ bar: retq + # CHECK: test_pad_via_relax: + # CHECK: 200: testq + # CHECK: 203: jne + # CHECK: 209: int3 + # note 6 byte jne which could be a 2 byte jne, but is instead + # expanded for padding purposes + # CHECK-NOT: nop + # CHECK: 220: callq + .global test_pad_via_relax + .p2align 5 +test_pad_via_relax: + testq %rax, %rax + jnz bar + .rept 23 + int3 + .endr + callq bar + .section "unknown" .p2align 4 .type baz,@function Index: llvm/test/MC/X86/align-via-relaxation.s =================================================================== --- /dev/null +++ llvm/test/MC/X86/align-via-relaxation.s @@ -0,0 +1,74 @@ + # RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s | llvm-objdump -d --section=.text - | FileCheck %s + + + .file "test.c" + .text + .section .text + # Demonstrate that we can relax instructions to provide padding, not + # just insert nops. jmps are being used for ease of demonstration. 
+ # CHECK: .text + # CHECK: 0: eb 1f jmp 31 + # CHECK: 2: e9 1a 00 00 00 jmp 26 + # CHECK: 7: e9 15 00 00 00 jmp 21 + # CHECK: c: e9 10 00 00 00 jmp 16 + # CHECK: 11: e9 0b 00 00 00 jmp 11 + # CHECK: 16: e9 06 00 00 00 jmp 6 + # CHECK: 1b: e9 01 00 00 00 jmp 1 + # CHECK: 20: cc int3 + .p2align 4 + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + .p2align 5 + int3 +foo: + ret + + # Check that we're not shifting around the offsets of labels - doing + # that would require a further round of relaxation + # CHECK: bar: + # CHECK: 22: eb fe jmp -2 + # CHECK: 24: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax) + # CHECK: 2e: 66 90 nop + # CHECK: 30: 0f 0b ud2 + +bar: + jmp bar +nobypass: + .p2align 4 + ud2 + + + # Canonical toy loop to show benefit - we can align the loop header with + # fewer nops by relaxing the branch, even though we don't need to + # CHECK: loop_preheader: + # CHECK: 45: 48 85 c0 testq %rax, %rax + # CHECK: 48: 0f 8e 22 00 00 00 jle 34 + # CHECK: 4e: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax) + # CHECK: 58: 0f 1f 84 00 00 00 00 00 nopl (%rax,%rax) + # CHECK: loop_header: + # CHECK: 60: 48 83 e8 01 subq $1, %rax + # CHECK: 64: 48 85 c0 testq %rax, %rax + # CHECK: 67: 7e 07 jle 7 + # CHECK: 69: e9 f2 ff ff ff jmp -14 + # CHECK: 6e: 66 90 nop + # CHECK: loop_exit: + # CHECK: 70: c3 retq + .p2align 5 + .skip 5 +loop_preheader: + testq %rax, %rax + jle loop_exit + .p2align 5 +loop_header: + subq $1, %rax + testq %rax, %rax + jle loop_exit + jmp loop_header + .p2align 4 +loop_exit: + ret