diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -12,7 +12,9 @@
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -103,6 +105,14 @@
     "assumptions about labels corresponding to particular instructions, "
     "and should be used with caution."));
 
+cl::opt<bool> X86PadForAlign(
+    "x86-pad-for-align", cl::init(true), cl::Hidden,
+    cl::desc("Pad previous instructions to implement align directives"));
+
+cl::opt<bool> X86PadForBranchAlign(
+    "x86-pad-for-branch-align", cl::init(true), cl::Hidden,
+    cl::desc("Pad previous instructions to implement branch alignment"));
+
 class X86ELFObjectWriter : public MCELFObjectTargetWriter {
 public:
   X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
@@ -173,6 +183,10 @@
   void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
                         MCInst &Res) const override;
 
+  bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+                              unsigned &RemainingSize) const;
+
+  void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override;
+
   bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
 };
 } // end anonymous namespace
@@ -639,6 +653,168 @@
   Res.setOpcode(RelaxedOp);
 }
 
+static bool canBeRelaxedForPadding(const MCRelaxableFragment &RF) {
+  // TODO: There are lots of other tricks we could apply for increasing
+  // encoding size without impacting performance.
+  auto &Inst = RF.getInst();
+  auto &STI = *RF.getSubtargetInfo();
+  bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+  return getRelaxedOpcode(Inst, is16BitMode) != Inst.getOpcode();
+}
+
+bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
+                                           MCCodeEmitter &Emitter,
+                                           unsigned &RemainingSize) const {
+  if (!canBeRelaxedForPadding(RF))
+    return false;
+
+  MCInst Relaxed;
+  relaxInstruction(RF.getInst(), *RF.getSubtargetInfo(), Relaxed);
+
+  SmallVector<MCFixup, 4> Fixups;
+  SmallString<15> Code;
+  raw_svector_ostream VecOS(Code);
+  Emitter.encodeInstruction(Relaxed, VecOS, Fixups, *RF.getSubtargetInfo());
+  const unsigned OldSize = RF.getContents().size();
+  const unsigned NewSize = Code.size();
+  assert(NewSize >= OldSize && "size decrease during relaxation?");
+  unsigned Delta = NewSize - OldSize;
+  if (Delta > RemainingSize)
+    return false;
+  RF.setInst(Relaxed);
+  RF.getContents() = Code;
+  RF.getFixups() = Fixups;
+  RemainingSize -= Delta;
+  return true;
+}
+
+void X86AsmBackend::finishLayout(MCAssembler const &Asm,
+                                 MCAsmLayout &Layout) const {
+  // See if we can further relax some instructions to cut down on the number of
+  // nop bytes required for code alignment.  The actual win is in reducing
+  // instruction count, not number of bytes.  Modern X86-64 can easily end up
+  // decode limited.  It is often better to reduce the number of instructions
+  // (i.e. eliminate nops) even at the cost of increasing the size and
+  // complexity of others.
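+  //
+  // In outline: walk each text section accumulating runs of relaxable
+  // fragments, and when an align (or boundary align) fragment is reached,
+  // grow the encodings of the instructions preceding it (e.g. a rel8 jump
+  // becomes a rel32 jump) until the padding that would otherwise be emitted
+  // as nops has been consumed.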
+  if (!X86PadForAlign && !X86PadForBranchAlign)
+    return;
+
+  DenseSet<MCFragment *> LabeledFragments;
+  for (const MCSymbol &S : Asm.symbols())
+    LabeledFragments.insert(S.getFragment(false));
+
+  for (MCSection &Sec : Asm) {
+    if (!Sec.getKind().isText())
+      continue;
+
+    SmallVector<MCRelaxableFragment *, 4> Relaxable;
+    for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) {
+      MCFragment &F = *I;
+
+      if (LabeledFragments.count(&F))
+        Relaxable.clear();
+
+      if (F.getKind() == MCFragment::FT_Data ||
+          F.getKind() == MCFragment::FT_CompactEncodedInst)
+        // Skip and ignore
+        continue;
+
+      if (F.getKind() == MCFragment::FT_Relaxable) {
+        auto &RF = cast<MCRelaxableFragment>(*I);
+        Relaxable.push_back(&RF);
+        continue;
+      }
+
+      auto canHandle = [](MCFragment &F) -> bool {
+        switch (F.getKind()) {
+        default:
+          return false;
+        case MCFragment::FT_Align:
+          return X86PadForAlign;
+        case MCFragment::FT_BoundaryAlign:
+          return X86PadForBranchAlign;
+        }
+      };
+      // For any unhandled kind, assume we can't change layout.
+      if (!canHandle(F)) {
+        Relaxable.clear();
+        continue;
+      }
+
+      const uint64_t OrigOffset = Layout.getFragmentOffset(&F);
+      const uint64_t OrigSize = Asm.computeFragmentSize(Layout, F);
+      if (OrigSize == 0 || Relaxable.empty()) {
+        Relaxable.clear();
+        continue;
+      }
+
+      // To keep the effects local, prefer to relax instructions closest to
+      // the align directive.  This is purely about human understandability
+      // of the resulting code.  If we later find a reason to expand
+      // particular instructions over others, we can adjust.
+      MCFragment *FirstChangedFragment = nullptr;
+      unsigned RemainingSize = OrigSize;
+      while (!Relaxable.empty() && RemainingSize != 0) {
+        auto &RF = *Relaxable.pop_back_val();
+        // Give the backend a chance to play any tricks it wishes to increase
+        // the encoding size of the given instruction.  Target independent code
+        // will try further relaxation, but targets may play further tricks.
+        if (padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize))
+          FirstChangedFragment = &RF;
+
+        // If we have an instruction which hasn't been fully relaxed, we can't
+        // skip past it and insert bytes before it.  Changing its starting
+        // offset might require a larger negative offset than it can encode.
+        // We don't need to worry about larger positive offsets as none of the
+        // possible offsets between this and our align are visible, and the
+        // ones afterwards aren't changing.
+        if (mayNeedRelaxation(RF.getInst(), *RF.getSubtargetInfo()))
+          break;
+      }
+      Relaxable.clear();
+
+      if (FirstChangedFragment) {
+        // Make sure the offsets for any fragments in the affected range get
+        // updated.  Note that this (conservatively) invalidates the offsets
+        // of everything that follows, which is more than strictly required.
+        Layout.invalidateFragmentsFrom(FirstChangedFragment);
+      }
+
+      // BoundaryAlign explicitly tracks its size (unlike align).
+      if (F.getKind() == MCFragment::FT_BoundaryAlign)
+        cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize);
+
+      const uint64_t FinalOffset = Layout.getFragmentOffset(&F);
+      const uint64_t FinalSize = Asm.computeFragmentSize(Layout, F);
+      assert(OrigOffset + OrigSize == FinalOffset + FinalSize &&
+             "can't move start of next fragment!");
+      assert(FinalSize == RemainingSize && "inconsistent size computation?");
+
+      // If we're looking at a boundary align, make sure we don't try to pad
+      // its target instructions for some following directive.  Doing so would
+      // break the alignment of the current boundary align.
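+      // Concretely: a branch which this boundary align just positioned must
+      // not be grown again to satisfy a later directive, so we skip past the
+      // fragment(s) containing it rather than adding them to Relaxable.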
+      if (F.getKind() == MCFragment::FT_BoundaryAlign) {
+        auto &BF = cast<MCBoundaryAlignFragment>(F);
+        const MCFragment *Next = BF.getNextNode();
+        // If the branch is unfused, it is emitted into one fragment; if it is
+        // fused, it is emitted into at most two fragments, and the next
+        // MCBoundaryAlignFragment (if one exists) also marks the end of the
+        // branch.
+        for (int i = 0, N = BF.isFused() ? 2 : 1;
+             i != N && !isa<MCBoundaryAlignFragment>(Next);
+             ++i, Next = Next->getNextNode(), I++) {
+        }
+      }
+    }
+  }
+
+  // The layout is done.  Mark every fragment as valid.
+  for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) {
+    MCSection &Section = *Layout.getSectionOrder()[i];
+    Layout.getFragmentOffset(&*Section.getFragmentList().rbegin());
+    Asm.computeFragmentSize(Layout, *Section.getFragmentList().rbegin());
+  }
+}
+
 /// Write a sequence of optimal nops to the output, covering \p Count
 /// bytes.
 /// \return - true on success, false on failure
diff --git a/llvm/test/MC/X86/align-branch-64.s b/llvm/test/MC/X86/align-branch-64.s
--- a/llvm/test/MC/X86/align-branch-64.s
+++ b/llvm/test/MC/X86/align-branch-64.s
@@ -103,6 +103,59 @@
 bar:
   retq
 
+  # CHECK: test_pad_via_relax:
+  # CHECK: 200: testq
+  # CHECK: 203: jne
+  # CHECK: 209: int3
+  # Note the 6-byte jne, which could have been encoded as a 2-byte jne but is
+  # instead expanded for padding purposes.
+  # CHECK-NOT: nop
+  # CHECK: 220: callq
+  .global test_pad_via_relax
+  .p2align 5
+test_pad_via_relax:
+  testq %rax, %rax
+  jnz bar
+  .rept 23
+  int3
+  .endr
+  callq bar
+
+  # This case looks really tempting to pad, but doing so for the call causes
+  # the jmp to be misaligned.
+  # CHECK: test_pad_via_relax_neg1:
+  # CHECK: 240: int3
+  # CHECK: 25a: testq
+  # CHECK: 25d: jne
+  # CHECK: 25f: nop
+  # CHECK: 260: callq
+  .global test_pad_via_relax_neg1
+  .p2align 5
+test_pad_via_relax_neg1:
+  .rept 26
+  int3
+  .endr
+  testq %rax, %rax
+  jnz bar
+  callq bar
+
+  # Same as the previous case, but without fusion.
+  # CHECK: test_pad_via_relax_neg2:
+  # CHECK: 280: int3
+  # CHECK: 29d: jmp
+  # CHECK: 29f: nop
+  # CHECK: 2a0: callq
+  .global test_pad_via_relax_neg2
+  .p2align 5
+test_pad_via_relax_neg2:
+  .rept 29
+  int3
+  .endr
+  jmp bar2
+  callq bar2
+
+bar2:
+
 .section "unknown"
 .p2align 4
 .type baz,@function
diff --git a/llvm/test/MC/X86/align-via-relaxation.s b/llvm/test/MC/X86/align-via-relaxation.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/X86/align-via-relaxation.s
@@ -0,0 +1,74 @@
+# RUN: llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu %s | llvm-objdump -d --section=.text - | FileCheck %s
+
+  .file "test.c"
+  .text
+  .section .text
+  # Demonstrate that we can relax instructions to provide padding, not
+  # just insert nops.  jmps are being used for ease of demonstration.
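+  # With seven 2-byte jmps this block would end at offset 0xe, requiring 18
+  # bytes of nops before the .p2align 5 boundary; instead, the six jmps
+  # closest to the directive are relaxed to their 5-byte rel32 form
+  # (2 + 6*5 = 32 bytes), so no nops are needed at all.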
+  # CHECK: .text
+  # CHECK: 0: eb 1f jmp 31
+  # CHECK: 2: e9 1a 00 00 00 jmp 26
+  # CHECK: 7: e9 15 00 00 00 jmp 21
+  # CHECK: c: e9 10 00 00 00 jmp 16
+  # CHECK: 11: e9 0b 00 00 00 jmp 11
+  # CHECK: 16: e9 06 00 00 00 jmp 6
+  # CHECK: 1b: e9 01 00 00 00 jmp 1
+  # CHECK: 20: cc int3
+  .p2align 4
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  jmp foo
+  .p2align 5
+  int3
+foo:
+  ret
+
+  # Check that we're not shifting around the offsets of labels - doing
+  # so would require a further round of relaxation.
+  # CHECK: bar:
+  # CHECK: 22: eb fe jmp -2
+  # CHECK: 24: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax)
+  # CHECK: 2e: 66 90 nop
+  # CHECK: 30: 0f 0b ud2
+bar:
+  jmp bar
+nobypass:
+  .p2align 4
+  ud2
+
+  # Canonical toy loop to show the benefit - we can align the loop header
+  # with fewer nops by relaxing the branch, even though we don't need to
+  # for correctness.
+  # CHECK: loop_preheader:
+  # CHECK: 45: 48 85 c0 testq %rax, %rax
+  # CHECK: 48: 0f 8e 22 00 00 00 jle 34
+  # CHECK: 4e: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax)
+  # CHECK: 58: 0f 1f 84 00 00 00 00 00 nopl (%rax,%rax)
+  # CHECK: loop_header:
+  # CHECK: 60: 48 83 e8 01 subq $1, %rax
+  # CHECK: 64: 48 85 c0 testq %rax, %rax
+  # CHECK: 67: 7e 07 jle 7
+  # CHECK: 69: e9 f2 ff ff ff jmp -14
+  # CHECK: 6e: 66 90 nop
+  # CHECK: loop_exit:
+  # CHECK: 70: c3 retq
+  .p2align 5
+  .skip 5
+loop_preheader:
+  testq %rax, %rax
+  jle loop_exit
+  .p2align 5
+loop_header:
+  subq $1, %rax
+  testq %rax, %rax
+  jle loop_exit
+  jmp loop_header
+  .p2align 4
+loop_exit:
+  ret
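
Note: the two options added above are ordinary (hidden) cl::opt flags, so they
can be toggled directly when experimenting. A minimal sketch, assuming a build
of llvm-mc that includes this patch, mirroring the RUN line of the new test:

  llvm-mc -mcpu=skylake -filetype=obj -triple x86_64-pc-linux-gnu \
      -x86-pad-for-align=0 -x86-pad-for-branch-align=0 \
      llvm/test/MC/X86/align-via-relaxation.s \
    | llvm-objdump -d --section=.text -

With both flags set to 0, the output should match the pre-patch behavior
(alignment implemented purely with nops); with the defaults (both true), the
relaxed encodings shown in the CHECK lines above are produced instead.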