Index: lld/trunk/ELF/InputSection.cpp =================================================================== --- lld/trunk/ELF/InputSection.cpp +++ lld/trunk/ELF/InputSection.cpp @@ -98,7 +98,9 @@ uint8_t *Buf, uint8_t *BufEnd, iterator_range *> Rels) { typedef Elf_Rel_Impl RelType; - for (const RelType &RI : Rels) { + size_t Num = Rels.end() - Rels.begin(); + for (size_t I = 0; I < Num; ++I) { + const RelType &RI = *(Rels.begin() + I); uint32_t SymIndex = RI.getSymbol(Config->Mips64EL); uint32_t Type = RI.getType(Config->Mips64EL); uintX_t Offset = getOffset(RI.r_offset); @@ -108,7 +110,8 @@ uint8_t *BufLoc = Buf + Offset; uintX_t AddrLoc = OutSec->getVA() + Offset; - if (Target->isTlsLocalDynamicReloc(Type)) { + if (Target->isTlsLocalDynamicReloc(Type) && + !Target->isTlsOptimized(Type, nullptr)) { Target->relocateOne(BufLoc, BufEnd, Type, AddrLoc, Out::Got->getVA() + Out::LocalModuleTlsIndexOffset + @@ -127,16 +130,20 @@ SymbolBody &Body = *File->getSymbolBody(SymIndex)->repl(); - if (Target->isTlsGlobalDynamicReloc(Type)) { + if (Target->isTlsGlobalDynamicReloc(Type) && + !Target->isTlsOptimized(Type, &Body)) { Target->relocateOne(BufLoc, BufEnd, Type, AddrLoc, Out::Got->getEntryAddr(Body) + getAddend(RI)); continue; } - if (Target->isTlsOptimized(Type, Body)) { - Target->relocateTlsOptimize(BufLoc, BufEnd, AddrLoc, - getSymVA(Body)); + if (Target->isTlsOptimized(Type, &Body)) { + // When a TLS relocation is optimized, it is sometimes necessary to skip + // relocations that immediately follow it. relocateTlsOptimize returns + // how many slots we need to skip. 
+ I += Target->relocateTlsOptimize(BufLoc, BufEnd, Type, AddrLoc, + getSymVA(Body)); continue; } Index: lld/trunk/ELF/Target.h =================================================================== --- lld/trunk/ELF/Target.h +++ lld/trunk/ELF/Target.h @@ -59,9 +59,10 @@ virtual bool relocNeedsPlt(uint32_t Type, const SymbolBody &S) const = 0; virtual void relocateOne(uint8_t *Loc, uint8_t *BufEnd, uint32_t Type, uint64_t P, uint64_t SA) const = 0; - virtual bool isTlsOptimized(unsigned Type, const SymbolBody &S) const; - virtual void relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, uint64_t P, - uint64_t SA) const; + virtual bool isTlsOptimized(unsigned Type, const SymbolBody *S) const; + virtual unsigned relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, + uint32_t Type, uint64_t P, + uint64_t SA) const; virtual ~TargetInfo(); protected: Index: lld/trunk/ELF/Target.cpp =================================================================== --- lld/trunk/ELF/Target.cpp +++ lld/trunk/ELF/Target.cpp @@ -80,9 +80,17 @@ void relocateOne(uint8_t *Loc, uint8_t *BufEnd, uint32_t Type, uint64_t P, uint64_t SA) const override; bool isRelRelative(uint32_t Type) const override; - bool isTlsOptimized(unsigned Type, const SymbolBody &S) const override; - void relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, uint64_t P, - uint64_t SA) const override; + bool isTlsOptimized(unsigned Type, const SymbolBody *S) const override; + unsigned relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, uint32_t Type, + uint64_t P, uint64_t SA) const override; + +private: + void relocateTlsLdToLe(uint8_t *Loc, uint8_t *BufEnd, uint64_t P, + uint64_t SA) const; + void relocateTlsGdToLe(uint8_t *Loc, uint8_t *BufEnd, uint64_t P, + uint64_t SA) const; + void relocateTlsIeToLe(uint8_t *Loc, uint8_t *BufEnd, uint64_t P, + uint64_t SA) const; }; class PPC64TargetInfo final : public TargetInfo { @@ -161,7 +169,7 @@ TargetInfo::~TargetInfo() {} -bool TargetInfo::isTlsOptimized(unsigned Type, const SymbolBody 
&S) const { +bool TargetInfo::isTlsOptimized(unsigned Type, const SymbolBody *S) const { return false; } @@ -177,8 +185,11 @@ bool TargetInfo::isRelRelative(uint32_t Type) const { return true; } -void TargetInfo::relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, uint64_t P, - uint64_t SA) const {} +unsigned TargetInfo::relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, + uint32_t Type, uint64_t P, + uint64_t SA) const { + return 0; +} void TargetInfo::writeGotHeaderEntries(uint8_t *Buf) const {} @@ -364,7 +375,7 @@ bool X86_64TargetInfo::relocNeedsGot(uint32_t Type, const SymbolBody &S) const { if (Type == R_X86_64_GOTTPOFF) - return !isTlsOptimized(Type, S); + return !isTlsOptimized(Type, &S); return Type == R_X86_64_GOTTPOFF || Type == R_X86_64_GOTPCREL || relocNeedsPlt(Type, S); } @@ -435,10 +446,54 @@ } bool X86_64TargetInfo::isTlsOptimized(unsigned Type, - const SymbolBody &S) const { - if (Config->Shared || !S.isTLS()) + const SymbolBody *S) const { + if (Config->Shared || (S && !S->isTLS())) return false; - return Type == R_X86_64_GOTTPOFF && !canBePreempted(&S, true); + return Type == R_X86_64_TLSLD || Type == R_X86_64_DTPOFF32 || + (Type == R_X86_64_TLSGD && !canBePreempted(S, true)) || + (Type == R_X86_64_GOTTPOFF && !canBePreempted(S, true)); +} + +// "Ulrich Drepper, ELF Handling For Thread-Local Storage" (5.5 +// x86-64 linker optimizations, http://www.akkadia.org/drepper/tls.pdf) shows +// how LD can be optimized to LE: +// leaq bar@tlsld(%rip), %rdi +// callq __tls_get_addr@PLT +// leaq bar@dtpoff(%rax), %rcx +// Is converted to: +// .word 0x6666 +// .byte 0x66 +// mov %fs:0,%rax +// leaq bar@tpoff(%rax), %rcx +void X86_64TargetInfo::relocateTlsLdToLe(uint8_t *Loc, uint8_t *BufEnd, + uint64_t P, uint64_t SA) const { + const uint8_t Inst[] = { + 0x66, 0x66, //.word 0x6666 + 0x66, //.byte 0x66 + 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00 // mov %fs:0,%rax + }; + memcpy(Loc - 3, Inst, sizeof(Inst)); +} + +// "Ulrich Drepper, ELF Handling For 
Thread-Local Storage" (5.5 +// x86-64 linker optimizations, http://www.akkadia.org/drepper/tls.pdf) shows +// how GD can be optimized to LE: +// .byte 0x66 +// leaq x@tlsgd(%rip), %rdi +// .word 0x6666 +// rex64 +// call __tls_get_addr@plt +// Is converted to: +// mov %fs:0x0,%rax +// lea x@tpoff(%rax),%rax +void X86_64TargetInfo::relocateTlsGdToLe(uint8_t *Loc, uint8_t *BufEnd, + uint64_t P, uint64_t SA) const { + const uint8_t Inst[] = { + 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0x0,%rax + 0x48, 0x8d, 0x80, 0x00, 0x00, 0x00, 0x00 // lea x@tpoff(%rax),%rax + }; + memcpy(Loc - 4, Inst, sizeof(Inst)); + relocateOne(Loc + 8, BufEnd, R_X86_64_TPOFF32, P, SA); } // In some conditions, R_X86_64_GOTTPOFF relocation can be optimized to @@ -446,8 +501,8 @@ // This function does that. Read "ELF Handling For Thread-Local Storage, // 5.5 x86-x64 linker optimizations" (http://www.akkadia.org/drepper/tls.pdf) // by Ulrich Drepper for details. -void X86_64TargetInfo::relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, - uint64_t P, uint64_t SA) const { +void X86_64TargetInfo::relocateTlsIeToLe(uint8_t *Loc, uint8_t *BufEnd, + uint64_t P, uint64_t SA) const { // Ulrich's document section 6.5 says that @gottpoff(%rip) must be // used in MOVQ or ADDQ instructions only. // "MOVQ foo@GOTTPOFF(%RIP), %REG" is transformed to "MOVQ $foo, %REG". @@ -476,6 +531,33 @@ relocateOne(Loc, BufEnd, R_X86_64_TPOFF32, P, SA); } +// This function applies a TLS relocation with an optimization as described +// in Ulrich's document. As a result of rewriting instructions at the +// relocation target, relocations that immediately follow the TLS relocation +// (which would be applied to rewritten instructions) may have to be skipped. +// This function returns the number of relocations that need to be skipped. 
+unsigned X86_64TargetInfo::relocateTlsOptimize(uint8_t *Loc, uint8_t *BufEnd, + uint32_t Type, uint64_t P, + uint64_t SA) const { + switch (Type) { + case R_X86_64_GOTTPOFF: + relocateTlsIeToLe(Loc, BufEnd, P, SA); + return 0; + case R_X86_64_TLSLD: + relocateTlsLdToLe(Loc, BufEnd, P, SA); + // The next relocation should be against __tls_get_addr, so skip it + return 1; + case R_X86_64_TLSGD: + relocateTlsGdToLe(Loc, BufEnd, P, SA); + // The next relocation should be against __tls_get_addr, so skip it + return 1; + case R_X86_64_DTPOFF32: + relocateOne(Loc, BufEnd, R_X86_64_TPOFF32, P, SA); + return 0; + } + llvm_unreachable("Unknown TLS optimization"); +} + void X86_64TargetInfo::relocateOne(uint8_t *Loc, uint8_t *BufEnd, uint32_t Type, uint64_t P, uint64_t SA) const { switch (Type) { Index: lld/trunk/ELF/Writer.cpp =================================================================== --- lld/trunk/ELF/Writer.cpp +++ lld/trunk/ELF/Writer.cpp @@ -203,6 +203,8 @@ uint32_t Type = RI.getType(Config->Mips64EL); if (Target->isTlsLocalDynamicReloc(Type)) { + if (Target->isTlsOptimized(Type, nullptr)) + continue; if (Out::LocalModuleTlsIndexOffset == uint32_t(-1)) { Out::LocalModuleTlsIndexOffset = Out::Got->addLocalModuleTlsIndex(); @@ -220,6 +222,8 @@ Body = Body->repl(); if (Body && Body->isTLS() && Target->isTlsGlobalDynamicReloc(Type)) { + if (Target->isTlsOptimized(Type, Body)) + continue; if (Body->isInGot()) continue; Out::Got->addDynTlsEntry(Body); Index: lld/trunk/test/ELF/tls-opt.s =================================================================== --- lld/trunk/test/ELF/tls-opt.s +++ lld/trunk/test/ELF/tls-opt.s @@ -20,12 +20,21 @@ // DISASM-NEXT: 1103f: 4d 8d bf fc ff ff ff leaq -4(%r15), %r15 // DISASM-NEXT: 11046: 48 81 c4 fc ff ff ff addq $-4, %rsp // DISASM-NEXT: 1104d: 49 81 c4 fc ff ff ff addq $-4, %r12 - // Corrupred output: // DISASM-NEXT: 11054: 48 8d 80 f8 ff ff ff leaq -8(%rax), %rax // DISASM-NEXT: 1105b: 48 d1 81 c4 f8 ff ff rolq -1852(%rcx) // 
DISASM-NEXT: 11062: ff 48 d1 decl -47(%rax) // DISASM-NEXT: 11065: 81 c4 f8 ff ff ff addl $4294967288, %esp +// LD to LE: +// DISASM-NEXT: 1106b: 66 66 66 64 48 8b 04 25 00 00 00 00 movq %fs:0, %rax +// DISASM-NEXT: 11077: 48 8d 88 f8 ff ff ff leaq -8(%rax), %rcx +// DISASM-NEXT: 1107e: 66 66 66 64 48 8b 04 25 00 00 00 00 movq %fs:0, %rax +// DISASM-NEXT: 1108a: 48 8d 88 fc ff ff ff leaq -4(%rax), %rcx +// GD to LE: +// DISASM-NEXT: 11091: 64 48 8b 04 25 00 00 00 00 movq %fs:0, %rax +// DISASM-NEXT: 1109a: 48 8d 80 f8 ff ff ff leaq -8(%rax), %rax +// DISASM-NEXT: 110a1: 64 48 8b 04 25 00 00 00 00 movq %fs:0, %rax +// DISASM-NEXT: 110aa: 48 8d 80 fc ff ff ff leaq -4(%rax), %rax .type tls0,@object .section .tbss,"awT",@nobits @@ -62,3 +71,23 @@ xchgq tls0@gottpoff(%rip),%rax shlq tls0@gottpoff rolq tls0@gottpoff + + //LD to LE: + leaq tls0@tlsld(%rip), %rdi + callq __tls_get_addr@PLT + leaq tls0@dtpoff(%rax),%rcx + leaq tls1@tlsld(%rip), %rdi + callq __tls_get_addr@PLT + leaq tls1@dtpoff(%rax),%rcx + + //GD to LE: + .byte 0x66 + leaq tls0@tlsgd(%rip),%rdi + .word 0x6666 + rex64 + call __tls_get_addr@plt + .byte 0x66 + leaq tls1@tlsgd(%rip),%rdi + .word 0x6666 + rex64 + call __tls_get_addr@plt