Index: lld/trunk/ELF/InputSection.cpp =================================================================== --- lld/trunk/ELF/InputSection.cpp +++ lld/trunk/ELF/InputSection.cpp @@ -134,6 +134,12 @@ continue; } + if (Target->isTlsOptimized(Type, Body)) { + Target->relocateTlsOptimize(BufLoc, BufEnd, AddrLoc, + getSymVA(Body)); + continue; + } + uintX_t SymVA = getSymVA(Body); if (Target->relocNeedsPlt(Type, Body)) { SymVA = Out::Plt->getEntryAddr(Body); Index: lld/trunk/ELF/Target.h =================================================================== --- lld/trunk/ELF/Target.h +++ lld/trunk/ELF/Target.h @@ -59,7 +59,9 @@ virtual bool relocNeedsPlt(uint32_t Type, const SymbolBody &S) const = 0; virtual void relocateOne(uint8_t *Loc, uint8_t *BufEnd, uint32_t Type, uint64_t P, uint64_t SA) const = 0; - + virtual bool isTlsOptimized(unsigned Type, const SymbolBody &S) const; + virtual void relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, uint64_t P, + uint64_t SA) const; virtual ~TargetInfo(); protected: Index: lld/trunk/ELF/Target.cpp =================================================================== --- lld/trunk/ELF/Target.cpp +++ lld/trunk/ELF/Target.cpp @@ -74,6 +74,9 @@ void relocateOne(uint8_t *Loc, uint8_t *BufEnd, uint32_t Type, uint64_t P, uint64_t SA) const override; bool isRelRelative(uint32_t Type) const override; + bool isTlsOptimized(unsigned Type, const SymbolBody &S) const override; + void relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, uint64_t P, + uint64_t SA) const override; }; class PPC64TargetInfo final : public TargetInfo { @@ -148,6 +151,10 @@ TargetInfo::~TargetInfo() {} +bool TargetInfo::isTlsOptimized(unsigned Type, const SymbolBody &S) const { + return false; +} + uint64_t TargetInfo::getVAStart() const { return Config->Shared ? 
0 : VAStart; } bool TargetInfo::relocNeedsCopy(uint32_t Type, const SymbolBody &S) const { @@ -162,6 +169,9 @@ bool TargetInfo::isRelRelative(uint32_t Type) const { return true; } +void TargetInfo::relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, uint64_t P, + uint64_t SA) const {} + void TargetInfo::writeGotHeaderEntries(uint8_t *Buf) const {} void TargetInfo::writeGotPltHeaderEntries(uint8_t *Buf) const {} @@ -279,6 +289,8 @@ } bool X86_64TargetInfo::relocNeedsGot(uint32_t Type, const SymbolBody &S) const { + if (Type == R_X86_64_GOTTPOFF) + return !isTlsOptimized(Type, S); return Type == R_X86_64_GOTTPOFF || Type == R_X86_64_GOTPCREL || relocNeedsPlt(Type, S); } @@ -344,6 +356,48 @@ } } +bool X86_64TargetInfo::isTlsOptimized(unsigned Type, + const SymbolBody &S) const { + if (Config->Shared || !S.isTLS()) + return false; + return Type == R_X86_64_GOTTPOFF && !canBePreempted(&S, true); +} + +// In some conditions, R_X86_64_GOTTPOFF relocation can be optimized to +// R_X86_64_TPOFF32 so that it does not use GOT. +// This function does that. Read "ELF Handling For Thread-Local Storage, +// 5.5 x86-x64 linker optimizations" (http://www.akkadia.org/drepper/tls.pdf) +// by Ulrich Drepper for details. +void X86_64TargetInfo::relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, + uint64_t P, uint64_t SA) const { + // Ulrich's document section 5.5 says that @gottpoff(%rip) must be + // used in MOVQ or ADDQ instructions only. + // "MOVQ foo@GOTTPOFF(%RIP), %REG" is transformed to "MOVQ $foo, %REG". + // "ADDQ foo@GOTTPOFF(%RIP), %REG" is transformed to "LEAQ foo(%REG), %REG" + // (if the register is not RSP/R12) or "ADDQ $foo, %RSP". + // Opcodes info can be found at http://ref.x86asm.net/coder64.html#x48. + uint8_t *Prefix = Loc - 3; + uint8_t *Inst = Loc - 2; + uint8_t *RegSlot = Loc - 1; + uint8_t Reg = Loc[-1] >> 3; + bool IsMov = *Inst == 0x8b; + bool RspAdd = !IsMov && Reg == 4; + // r12 and rsp registers require special handling. 
+ // The problem is that for other registers, for example leaq 0xXXXXXXXX(%r11),%r11 + // the result is 7 bytes: 4d 8d 9b XX XX XX XX, + // but leaq 0xXXXXXXXX(%r12),%r12 is 8 bytes: 4d 8d a4 24 XX XX XX XX. + // The same is true for rsp. So we convert to addq for them, saving 1 byte that + // we don't have. + if (RspAdd) + *Inst = 0x81; + else + *Inst = IsMov ? 0xc7 : 0x8d; + if (*Prefix == 0x4c) + *Prefix = (IsMov || RspAdd) ? 0x49 : 0x4d; + *RegSlot = (IsMov || RspAdd) ? (0xc0 | Reg) : (0x80 | Reg | (Reg << 3)); + relocateOne(Loc, BufEnd, R_X86_64_TPOFF32, P, SA); +} + void X86_64TargetInfo::relocateOne(uint8_t *Loc, uint8_t *BufEnd, uint32_t Type, uint64_t P, uint64_t SA) const { switch (Type) { Index: lld/trunk/test/ELF/tls-opt.s =================================================================== --- lld/trunk/test/ELF/tls-opt.s +++ lld/trunk/test/ELF/tls-opt.s @@ -0,0 +1,64 @@ +// RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o +// RUN: ld.lld %t.o -o %t1 +// RUN: llvm-readobj -r %t1 | FileCheck --check-prefix=NORELOC %s +// RUN: llvm-objdump -d %t1 | FileCheck --check-prefix=DISASM %s + +// NORELOC: Relocations [ +// NORELOC-NEXT: ] + +// DISASM: Disassembly of section .text: +// DISASM-NEXT: _start: +// DISASM-NEXT: 11000: 48 c7 c0 f8 ff ff ff movq $-8, %rax +// DISASM-NEXT: 11007: 49 c7 c7 f8 ff ff ff movq $-8, %r15 +// DISASM-NEXT: 1100e: 48 8d 80 f8 ff ff ff leaq -8(%rax), %rax +// DISASM-NEXT: 11015: 4d 8d bf f8 ff ff ff leaq -8(%r15), %r15 +// DISASM-NEXT: 1101c: 48 81 c4 f8 ff ff ff addq $-8, %rsp +// DISASM-NEXT: 11023: 49 81 c4 f8 ff ff ff addq $-8, %r12 +// DISASM-NEXT: 1102a: 48 c7 c0 fc ff ff ff movq $-4, %rax +// DISASM-NEXT: 11031: 49 c7 c7 fc ff ff ff movq $-4, %r15 +// DISASM-NEXT: 11038: 48 8d 80 fc ff ff ff leaq -4(%rax), %rax +// DISASM-NEXT: 1103f: 4d 8d bf fc ff ff ff leaq -4(%r15), %r15 +// DISASM-NEXT: 11046: 48 81 c4 fc ff ff ff addq $-4, %rsp +// DISASM-NEXT: 1104d: 49 81 c4 fc ff ff ff addq $-4, %r12 + +// Corrupted
output: +// DISASM-NEXT: 11054: 48 8d 80 f8 ff ff ff leaq -8(%rax), %rax +// DISASM-NEXT: 1105b: 48 d1 81 c4 f8 ff ff rolq -1852(%rcx) +// DISASM-NEXT: 11062: ff 48 d1 decl -47(%rax) +// DISASM-NEXT: 11065: 81 c4 f8 ff ff ff addl $4294967288, %esp + +.type tls0,@object +.section .tbss,"awT",@nobits +.globl tls0 +.align 4 +tls0: + .long 0 + .size tls0, 4 + +.type tls1,@object +.globl tls1 +.align 4 +tls1: + .long 0 + .size tls1, 4 + +.section .text +.globl _start +_start: + movq tls0@GOTTPOFF(%rip), %rax + movq tls0@GOTTPOFF(%rip), %r15 + addq tls0@GOTTPOFF(%rip), %rax + addq tls0@GOTTPOFF(%rip), %r15 + addq tls0@GOTTPOFF(%rip), %rsp + addq tls0@GOTTPOFF(%rip), %r12 + movq tls1@GOTTPOFF(%rip), %rax + movq tls1@GOTTPOFF(%rip), %r15 + addq tls1@GOTTPOFF(%rip), %rax + addq tls1@GOTTPOFF(%rip), %r15 + addq tls1@GOTTPOFF(%rip), %rsp + addq tls1@GOTTPOFF(%rip), %r12 + + //Invalid input case: + xchgq tls0@gottpoff(%rip),%rax + shlq tls0@gottpoff + rolq tls0@gottpoff