// Review summary. This text sits ahead of the first "Index:" header, so it is
// commentary and not part of any hunk.
// The patch touches ELF/InputSection.cpp, ELF/Target.h, ELF/Target.cpp and
// ELF/Writer.cpp, and adds test/ELF/tls-opt.s:
//  * TargetInfo gains isTlsOptimized() (base returns false) and
//    relocateTlsOptimize() (base is a no-op); X86_64TargetInfo overrides both.
//  * When Config->Shared is false, X86_64TargetInfo rewrites R_X86_64_TLSLD and
//    non-preemptible R_X86_64_TLSGD code sequences in place so they read
//    %fs:0, applies R_X86_64_DTPOFF32 as R_X86_64_TPOFF32, and increments
//    RelNdx so the caller's loop skips the following relocation.
//  * InputSection.cpp iterates relocations by index so that RelNdx can be
//    advanced by the target hook.
//  * Writer.cpp does not create GOT TLS entries for optimized relocations.
//  * relocateTlsLdToLe copies 12 bytes to [Loc - 3, Loc + 9) and validates
//    both ends of that window; relocateTlsGdToLe copies 16 bytes to
//    [Loc - 4, Loc + 12) and validates both ends likewise.
// NOTE(review): the line structure looks collapsed and some template argument
// lists appear to be missing (e.g. "iterator_range *> Rels"); presumably this
// text will not apply with patch(1) as-is -- confirm against the original.
Index: ELF/InputSection.cpp =================================================================== --- ELF/InputSection.cpp +++ ELF/InputSection.cpp @@ -98,7 +98,9 @@ uint8_t *Buf, uint8_t *BufEnd, iterator_range *> Rels) { typedef Elf_Rel_Impl RelType; - for (const RelType &RI : Rels) { + size_t Num = Rels.end() - Rels.begin(); + for (size_t I = 0; I < Num; ++I) { + const RelType &RI = *(Rels.begin() + I); uint32_t SymIndex = RI.getSymbol(Config->Mips64EL); uint32_t Type = RI.getType(Config->Mips64EL); uintX_t Offset = getOffset(RI.r_offset); @@ -108,7 +110,8 @@ uint8_t *BufLoc = Buf + Offset; uintX_t AddrLoc = OutSec->getVA() + Offset; - if (Target->isTlsLocalDynamicReloc(Type)) { + if (Target->isTlsLocalDynamicReloc(Type) && + (!Target->isTlsOptimized(Type, nullptr))) { Target->relocateOne(BufLoc, BufEnd, Type, AddrLoc, Out::Got->getVA() + Out::LocalModuleTlsIndexOffset + @@ -127,13 +130,20 @@ SymbolBody &Body = *File->getSymbolBody(SymIndex)->repl(); - if (Target->isTlsGlobalDynamicReloc(Type)) { + if (Target->isTlsGlobalDynamicReloc(Type) && + (!Target->isTlsOptimized(Type, &Body))) { Target->relocateOne(BufLoc, BufEnd, Type, AddrLoc, Out::Got->getEntryAddr(Body) + getAddend(RI)); continue; } + if (Body.isTLS() && Target->isTlsOptimized(Type, &Body)) { + Target->relocateTlsOptimize(BufLoc, Buf, BufEnd, Type, AddrLoc, + getSymVA(Body), I); + continue; + } + uintX_t SymVA = getSymVA(Body); if (Target->relocNeedsPlt(Type, Body)) { SymVA = Out::Plt->getEntryAddr(Body); Index: ELF/Target.h =================================================================== --- ELF/Target.h +++ ELF/Target.h @@ -59,7 +59,10 @@ virtual bool relocNeedsPlt(uint32_t Type, const SymbolBody &S) const = 0; virtual void relocateOne(uint8_t *Loc, uint8_t *BufEnd, uint32_t Type, uint64_t P, uint64_t SA) const = 0; - + virtual bool isTlsOptimized(unsigned Type, const SymbolBody *S) const; + virtual void relocateTlsOptimize(uint8_t *Loc, uint8_t *BufStart, + uint8_t *BufEnd, uint32_t Type, uint64_t 
P, + uint64_t SA, size_t &RelNdx) const; virtual ~TargetInfo(); protected: Index: ELF/Target.cpp =================================================================== --- ELF/Target.cpp +++ ELF/Target.cpp @@ -74,6 +74,16 @@ void relocateOne(uint8_t *Loc, uint8_t *BufEnd, uint32_t Type, uint64_t P, uint64_t SA) const override; bool isRelRelative(uint32_t Type) const override; + bool isTlsOptimized(unsigned Type, const SymbolBody *S) const override; + void relocateTlsOptimize(uint8_t *Loc, uint8_t *BufStart, uint8_t *BufEnd, + uint32_t Type, uint64_t P, uint64_t SA, + size_t &RelNdx) const override; + +private: + void relocateTlsLdToLe(uint8_t *Loc, uint8_t *BufStart, uint8_t *BufEnd, + uint64_t P, uint64_t SA, size_t &RelNdx) const; + void relocateTlsGdToLe(uint8_t *Loc, uint8_t *BufStart, uint8_t *BufEnd, + uint64_t P, uint64_t SA, size_t &RelNdx) const; }; class PPC64TargetInfo final : public TargetInfo { @@ -147,6 +157,10 @@ TargetInfo::~TargetInfo() {} +bool TargetInfo::isTlsOptimized(unsigned Type, const SymbolBody *S) const { + return false; +} + uint64_t TargetInfo::getVAStart() const { return Config->Shared ? 
0 : VAStart; } bool TargetInfo::relocNeedsCopy(uint32_t Type, const SymbolBody &S) const { @@ -159,6 +173,10 @@ bool TargetInfo::isRelRelative(uint32_t Type) const { return true; } +void TargetInfo::relocateTlsOptimize(uint8_t *Loc, uint8_t *BufStart, + uint8_t *BufEnd, uint32_t Type, uint64_t P, + uint64_t SA, size_t &RelNdx) const {} + void TargetInfo::writeGotHeaderEntries(uint8_t *Buf) const {} void TargetInfo::writeGotPltHeaderEntries(uint8_t *Buf) const {} @@ -341,6 +359,80 @@ } } +bool X86_64TargetInfo::isTlsOptimized(unsigned Type, + const SymbolBody *S) const { + if (Config->Shared) + return false; + return (Type == R_X86_64_TLSLD) || (Type == R_X86_64_DTPOFF32) || + (Type == R_X86_64_TLSGD && !canBePreempted(S, true)); +} + +// "Ulrich Drepper, ELF Handling For Thread-Local Storage" (5.5 +// x86-64 linker optimizations, http://www.akkadia.org/drepper/tls.pdf) shows how +// LD can be optimized to LE: +// leaq bar@tlsld(%rip), %rdi +// callq __tls_get_addr@PLT +// leaq bar@dtpoff(%rax), %rcx +// Is converted to: +// .word 0x6666 +// .byte 0x66 +// mov %fs:0,%rax +// leaq bar@tpoff(%rax), %rcx +void X86_64TargetInfo::relocateTlsLdToLe(uint8_t *Loc, uint8_t *BufStart, + uint8_t *BufEnd, uint64_t P, + uint64_t SA, size_t &RelNdx) const { + if (Loc - 3 < BufStart || Loc + 9 > BufEnd) + error("Tls relocation LdToLe optimization fail, buffer overrun !"); + const uint8_t Inst[] = { + 0x66, 0x66, //.word 0x6666 + 0x66, //.byte 0x66 + 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00 // mov %fs:0,%rax + }; + memcpy(Loc - 3, Inst, sizeof(Inst)); + // The next relocation should be against __tls_get_addr, so skip it. 
+ ++RelNdx; +} + +// "Ulrich Drepper, ELF Handling For Thread-Local Storage" (5.5 +// x86-64 linker optimizations, http://www.akkadia.org/drepper/tls.pdf) shows how +// GD can be optimized to LE: +// .byte 0x66 +// leaq x@tlsgd(%rip), %rdi +// .word 0x6666 +// rex64 +// call __tls_get_addr@plt +// Is converted to: +// mov %fs:0x0,%rax +// lea x@tpoff(%rax),%rax +void X86_64TargetInfo::relocateTlsGdToLe(uint8_t *Loc, uint8_t *BufStart, + uint8_t *BufEnd, uint64_t P, + uint64_t SA, size_t &RelNdx) const { + if (Loc - 4 < BufStart || Loc + 12 > BufEnd) + error("Tls relocation GdToLe optimization fail, buffer overrun !"); + const uint8_t Inst[] = { + 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0x0,%rax + 0x48, 0x8d, 0x80, 0x00, 0x00, 0x00, 0x00 // lea x@tpoff(%rax),%rax + }; + memcpy(Loc - 4, Inst, sizeof(Inst)); + relocateOne(Loc + 8, BufEnd, R_X86_64_TPOFF32, P, SA); + // The next relocation should be against __tls_get_addr, so skip it. + ++RelNdx; +} + +void X86_64TargetInfo::relocateTlsOptimize(uint8_t *Loc, uint8_t *BufStart, + uint8_t *BufEnd, uint32_t Type, + uint64_t P, uint64_t SA, + size_t &RelNdx) const { + if (Type == R_X86_64_TLSLD) + relocateTlsLdToLe(Loc, BufStart, BufEnd, P, SA, RelNdx); + else if (Type == R_X86_64_TLSGD) + relocateTlsGdToLe(Loc, BufStart, BufEnd, P, SA, RelNdx); + else if (Type == R_X86_64_DTPOFF32) + relocateOne(Loc, BufEnd, R_X86_64_TPOFF32, P, SA); + else + error("Unknown TLS optimization"); +} + void X86_64TargetInfo::relocateOne(uint8_t *Loc, uint8_t *BufEnd, uint32_t Type, uint64_t P, uint64_t SA) const { switch (Type) { Index: ELF/Writer.cpp =================================================================== --- ELF/Writer.cpp +++ ELF/Writer.cpp @@ -200,6 +200,8 @@ uint32_t Type = RI.getType(Config->Mips64EL); if (Target->isTlsLocalDynamicReloc(Type)) { + if (Target->isTlsOptimized(Type, nullptr)) + continue; if (Out::LocalModuleTlsIndexOffset == uint32_t(-1)) { Out::LocalModuleTlsIndexOffset = 
Out::Got->addLocalModuleTlsIndex(); @@ -217,6 +219,8 @@ Body = Body->repl(); if (Body && Body->isTLS() && Target->isTlsGlobalDynamicReloc(Type)) { + if (Target->isTlsOptimized(Type, Body)) + continue; if (Body->isInGot()) continue; Out::Got->addDynTlsEntry(Body); Index: test/ELF/tls-opt.s =================================================================== --- test/ELF/tls-opt.s +++ test/ELF/tls-opt.s @@ -0,0 +1,60 @@ +// RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o +// RUN: ld.lld -e main %t.o -o %t1 +// RUN: llvm-readobj -r %t1 | FileCheck --check-prefix=NORELOC %s +// RUN: llvm-objdump -d %t1 | FileCheck --check-prefix=DISASM %s + +// NORELOC: Relocations [ +// NORELOC-NEXT: ] + +// DISASM: Disassembly of section .text: +// DISASM-NEXT: main: +//LD to LE: +// DISASM-NEXT: 11000: 66 66 66 64 48 8b 04 25 00 00 00 00 movq %fs:0, %rax +// DISASM-NEXT: 1100c: 48 8d 88 f8 ff ff ff leaq -8(%rax), %rcx +// DISASM-NEXT: 11013: 66 66 66 64 48 8b 04 25 00 00 00 00 movq %fs:0, %rax +// DISASM-NEXT: 1101f: 48 8d 88 fc ff ff ff leaq -4(%rax), %rcx +//GD to LE: +// DISASM-NEXT: 11026: 64 48 8b 04 25 00 00 00 00 movq %fs:0, %rax +// DISASM-NEXT: 1102f: 48 8d 80 f8 ff ff ff leaq -8(%rax), %rax +// DISASM-NEXT: 11036: 64 48 8b 04 25 00 00 00 00 movq %fs:0, %rax +// DISASM-NEXT: 1103f: 48 8d 80 fc ff ff ff leaq -4(%rax), %rax + +.type tls0,@object +.section .tbss,"awT",@nobits +.globl tls0 +.align 4 +tls0: + .long 0 + .size tls0, 4 + +.type tls1,@object +.globl tls1 +.align 4 +tls1: + .long 0 + .size tls1, 4 + +.text + .globl main + .align 16, 0x90 + .type main,@function +main: + //LD to LE: + leaq tls0@tlsld(%rip), %rdi + callq __tls_get_addr@PLT + leaq tls0@dtpoff(%rax),%rcx + leaq tls1@tlsld(%rip), %rdi + callq __tls_get_addr@PLT + leaq tls1@dtpoff(%rax),%rcx + + //GD to LE: + .byte 0x66 + leaq tls0@tlsgd(%rip),%rdi + .word 0x6666 + rex64 + call __tls_get_addr@plt + .byte 0x66 + leaq tls1@tlsgd(%rip),%rdi + .word 0x6666 + rex64 + call __tls_get_addr@plt