Index: llvm/include/llvm/MC/MCAsmBackend.h =================================================================== --- llvm/include/llvm/MC/MCAsmBackend.h +++ llvm/include/llvm/MC/MCAsmBackend.h @@ -23,6 +23,7 @@ class MCAsmLayout; class MCAssembler; class MCCFIInstruction; +class MCCodeEmitter; struct MCFixupKindInfo; class MCFragment; class MCInst; @@ -168,6 +169,14 @@ /// @} + /// Expand the instruction encoding of RF up to RemainingSize bytes. This is + /// used as an alternative padding mechanism to nops for alignment purposes. + virtual bool padInstructionEncoding(MCRelaxableFragment &RF, + MCCodeEmitter &Emitter, + unsigned &RemainingSize) { + return false; + } + /// Returns the minimum size of a nop in bytes on this target. The assembler /// will use this to emit excess padding in situations where the padding /// required for simple alignment would be less than the minimum nop size. Index: llvm/include/llvm/MC/MCAssembler.h =================================================================== --- llvm/include/llvm/MC/MCAssembler.h +++ llvm/include/llvm/MC/MCAssembler.h @@ -202,6 +202,10 @@ MCCVInlineLineTableFragment &DF); bool relaxCVDefRange(MCAsmLayout &Layout, MCCVDefRangeFragment &DF); + /// Once relaxation is complete, try to reduce number of nops required + /// without requiring any further relaxation. + void optimizeLayout(MCAsmLayout &Layout); + /// finishLayout - Finalize a layout, including fragment lowering. void finishLayout(MCAsmLayout &Layout); Index: llvm/lib/MC/MCAssembler.cpp =================================================================== --- llvm/lib/MC/MCAssembler.cpp +++ llvm/lib/MC/MCAssembler.cpp @@ -848,6 +848,13 @@ errs() << "assembler backend - post-relaxation\n--\n"; dump(); }); + optimizeLayout(Layout); + + DEBUG_WITH_TYPE("mc-dump", { + errs() << "assembler backend - post-optimization\n--\n"; + dump(); + }); + // Finalize the layout, including fragment lowering. finishLayout(Layout); @@ -1152,6 +1159,111 @@ return WasRelaxed; } +void MCAssembler::optimizeLayout(MCAsmLayout &Layout) { + // See if we can further relax some instructions to cut down on the number of + // nop bytes required for code alignment. The actual win is in reducing + // instruction count, not number of bytes. Some micro-architectures (such + // as, say, modern X86-64) can easily end up decode limited. It is + // often better to reduce the number of instructions (i.e. eliminate nops) + // even at the cost of increasing the size and complexity of others. + + DenseSet LabeledFragments; + for (const MCSymbol &S : symbols()) + LabeledFragments.insert(S.getFragment(false)); + + for (MCSection &Sec : *this) { + if (!Sec.getKind().isText()) + continue; + + SmallVector Relaxable; + for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) { + MCFragment &F = *I; + + if (LabeledFragments.count(&F)) + Relaxable.clear(); + + if (F.getKind() == MCFragment::FT_Data || + F.getKind() == MCFragment::FT_CompactEncodedInst) + // Skip and ignore + continue; + + if (F.getKind() == MCFragment::FT_Relaxable) { + auto &RF = cast(*I); + Relaxable.push_back(&RF); + continue; + } + + // For any unhandled kind, assume we can't change layout. + if (F.getKind() != MCFragment::FT_Align && + F.getKind() != MCFragment::FT_BoundaryAlign) { + Relaxable.clear(); + continue; + } + + const uint64_t OrigOffset = Layout.getFragmentOffset(&F); + const uint64_t OrigSize = computeFragmentSize(Layout, F); + if (OrigSize == 0 || Relaxable.empty()) { + Relaxable.clear(); + continue; + } + + // To keep the effects local, prefer to relax instructions closest to + // the align directive. This is purely about human understandability + // of the resulting code. If we later find a reason to expand + // particular instructions over others, we can adjust. + MCFragment *FirstChangedFragment = nullptr; + unsigned RemainingSize = OrigSize; + while (!Relaxable.empty() && RemainingSize != 0) { + auto &RF = *Relaxable.pop_back_val(); + // Give the backend a chance to play any tricks it wishes to increase + // the encoding size of the given instruction. Target independent code + // will try further relaxation, but target's may play further tricks. + if (getBackend().padInstructionEncoding(RF, getEmitter(), + RemainingSize)) + FirstChangedFragment = &RF; + + // If we have an instruction which hasn't been fully relaxed, we can't + // skip past it and insert bytes before it. Changing it's starting + // offset might require a larger negative offset than it can encode. + // We don't need to worry about larger positive offsets as none of the + // possible offsets between this and our align are visible, and the + // ones afterwards aren't changing. + if (getBackend().mayNeedRelaxation(RF.getInst(), + *RF.getSubtargetInfo())) + break; + } + Relaxable.clear(); + + if (FirstChangedFragment) { + // Make sure the offsets for any fragments in the effected range get + // updated. Note that this (conservatively) invalidates the offsets of + // those following, but this is not required. + Layout.invalidateFragmentsFrom(FirstChangedFragment); + } + + const uint64_t FinalOffset = Layout.getFragmentOffset(&F); + const uint64_t FinalSize = computeFragmentSize(Layout, F); + assert(OrigOffset + OrigSize == FinalOffset + FinalSize && + "can't move start of next fragment!"); + assert(FinalSize == RemainingSize && "inconsistent size computation?"); + + // If we're looking at a boundary align, make sure we don't try to pad a + // it's target instructions for some following directive. Doing so would + // break the alignment of the current boundary align. + if (F.getKind() == MCFragment::FT_BoundaryAlign) { + auto &BF = cast(F); + const MCFragment *F = BF.getNextNode(); + // If the branch is unfused, it is emitted into one fragment, otherwise + // it is emitted into two fragments at most, the next + // MCBoundaryAlignFragment(if exists) also marks the end of the branch. + for (int i = 0, N = BF.isFused() ? 2 : 1; + i != N && !isa(F); + ++i, F = F->getNextNode(), I++) {} + } + } + } +} + void MCAssembler::finishLayout(MCAsmLayout &Layout) { assert(getBackendPtr() && "Expected assembler backend"); // The layout is done. Mark every fragment as valid. Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp =================================================================== --- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -13,6 +13,7 @@ #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCELFObjectWriter.h" @@ -173,6 +174,9 @@ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, MCInst &Res) const override; + bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter, + unsigned &RemainingSize) override; + bool writeNopData(raw_ostream &OS, uint64_t Count) const override; }; } // end anonymous namespace @@ -639,6 +643,41 @@ Res.setOpcode(RelaxedOp); } +static bool canBeRelaxedForPadding(const MCRelaxableFragment &RF) { + // TODO: There are lots of other tricks we could apply for increasing + // encoding size without impacting performance. + auto &Inst = RF.getInst(); + auto &STI = *RF.getSubtargetInfo(); + bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + return getRelaxedOpcode(Inst, is16BitMode) != Inst.getOpcode(); +} + +bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF, + MCCodeEmitter &Emitter, + unsigned &RemainingSize) { + if (!canBeRelaxedForPadding(RF)) + return false; + + MCInst Relaxed; + relaxInstruction(RF.getInst(), *RF.getSubtargetInfo(), Relaxed); + + SmallVector Fixups; + SmallString<15> Code; + raw_svector_ostream VecOS(Code); + Emitter.encodeInstruction(Relaxed, VecOS, Fixups, *RF.getSubtargetInfo()); + const unsigned OldSize = RF.getContents().size(); + const unsigned NewSize = Code.size(); + assert(NewSize >= OldSize && "size decrease during relaxation?"); + unsigned Delta = NewSize - OldSize; + if (Delta > RemainingSize) + return false; + RF.setInst(Relaxed); + RF.getContents() = Code; + RF.getFixups() = Fixups; + RemainingSize -= Delta; + return true; +} + /// Write a sequence of optimal nops to the output, covering \p Count /// bytes. /// \return - true on success, false on failure Index: llvm/test/MC/X86/align-branch-64.s =================================================================== --- llvm/test/MC/X86/align-branch-64.s +++ llvm/test/MC/X86/align-branch-64.s @@ -103,6 +103,59 @@ bar: retq + # CHECK: test_pad_via_relax: + # CHECK: 200: testq + # CHECK: 203: jne + # CHECK: 209: int3 + # note 6 byte jne which could be a 2 byte jne, but is instead + # expanded for padding purposes + # CHECK-NOT: nop + # CHECK: 220: callq + .global test_pad_via_relax + .p2align 5 +test_pad_via_relax: + testq %rax, %rax + jnz bar + .rept 23 + int3 + .endr + callq bar + + # This case looks really tempting to pad, but doing so for the call causes + # the jmp to be misaligned. + # CHECK: test_pad_via_relax_neg1: + # CHECK: 240: int3 + # CHECK: 25a: testq + # CHECK: 25d: jne + # CHECK: 25f: nop + # CHECK: 260: callq + .global test_pad_via_relax_neg1 + .p2align 5 +test_pad_via_relax_neg1: + .rept 26 + int3 + .endr + testq %rax, %rax + jnz bar + callq bar + + # Same as previous, but without fusion + # CHECK: test_pad_via_relax_neg2: + # CHECK: 280: int3 + # CHECK: 29d: jmp + # CHECK: 29f: nop + # CHECK: 2a0: callq + .global test_pad_via_relax_neg2 + .p2align 5 +test_pad_via_relax_neg2: + .rept 29 + int3 + .endr + jmp bar2 + callq bar2 + +bar2: + .section "unknown" .p2align 4 .type baz,@function Index: llvm/test/MC/X86/align-via-relaxation.s =================================================================== --- /dev/null +++ llvm/test/MC/X86/align-via-relaxation.s @@ -0,0 +1,74 @@ +# RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s | llvm-objdump -d --section=.text - | FileCheck %s + + + .file "test.c" + .text + .section .text + # Demonstrate that we can relax instructions to provide padding, not + # just insert nops. jmps are being used for ease of demonstration. + # CHECK: .text + # CHECK: 0: eb 1f jmp 31 + # CHECK: 2: e9 1a 00 00 00 jmp 26 + # CHECK: 7: e9 15 00 00 00 jmp 21 + # CHECK: c: e9 10 00 00 00 jmp 16 + # CHECK: 11: e9 0b 00 00 00 jmp 11 + # CHECK: 16: e9 06 00 00 00 jmp 6 + # CHECK: 1b: e9 01 00 00 00 jmp 1 + # CHECK: 20: cc int3 + .p2align 4 + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + jmp foo + .p2align 5 + int3 +foo: + ret + + # Check that we're not shifting aroudn the offsets of labels - doing + # that would require a further round of relaxation + # CHECK: bar: + # CHECK: 22: eb fe jmp -2 + # CHECK: 24: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax) + # CHECK: 2e: 66 90 nop + # CHECK: 30: 0f 0b ud2 + +bar: + jmp bar +nobypass: + .p2align 4 + ud2 + + + # Canonical toy loop to show benefit - we can align the loop header with + # fewer nops by relaxing the branch, even though we don't need to + # CHECK: loop_preheader: + # CHECK: 45: 48 85 c0 testq %rax, %rax + # CHECK: 48: 0f 8e 22 00 00 00 jle 34 + # CHECK: 4e: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax) + # CHECK: 58: 0f 1f 84 00 00 00 00 00 nopl (%rax,%rax) + # CHECK: loop_header: + # CHECK: 60: 48 83 e8 01 subq $1, %rax + # CHECK: 64: 48 85 c0 testq %rax, %rax + # CHECK: 67: 7e 07 jle 7 + # CHECK: 69: e9 f2 ff ff ff jmp -14 + # CHECK: 6e: 66 90 nop + # CHECK: loop_exit: + # CHECK: 70: c3 retq + .p2align 5 + .skip 5 +loop_preheader: + testq %rax, %rax + jle loop_exit + .p2align 5 +loop_header: + subq $1, %rax + testq %rax, %rax + jle loop_exit + jmp loop_header + .p2align 4 +loop_exit: + ret