Index: lld/trunk/ELF/Arch/PPC64.cpp =================================================================== --- lld/trunk/ELF/Arch/PPC64.cpp +++ lld/trunk/ELF/Arch/PPC64.cpp @@ -39,12 +39,23 @@ enum DFormOpcd { LBZ = 34, + LBZU = 35, LHZ = 40, + LHZU = 41, + LHAU = 43, LWZ = 32, + LWZU = 33, + LFSU = 49, LD = 58, + LFDU = 51, STB = 38, + STBU = 39, STH = 44, + STHU = 45, STW = 36, + STWU = 37, + STFSU = 53, + STFDU = 55, STD = 62, ADDI = 14 }; @@ -116,6 +127,30 @@ } } +static bool isInstructionUpdateForm(uint32_t Encoding) { + switch (getPrimaryOpCode(Encoding)) { + default: + return false; + case LBZU: + case LHAU: + case LHZU: + case LWZU: + case LFSU: + case LFDU: + case STBU: + case STHU: + case STWU: + case STFSU: + case STFDU: + return true; + // LWA has the same opcode as LD, and the DS bits is what differentiates + // between LD/LDU/LWA + case LD: + case STD: + return (Encoding & 3) == 1; + } +} + // There are a number of places when we either want to read or write an // instruction when handling a half16 relocation type. On big-endian the buffer // pointer is pointing into the middle of the word we want to extract, and on @@ -519,9 +554,15 @@ } } +static bool isTocRelType(RelType Type) { + return Type == R_PPC64_TOC16_HA || Type == R_PPC64_TOC16_LO_DS || + Type == R_PPC64_TOC16_LO; +} + void PPC64::relocateOne(uint8_t *Loc, RelType Type, uint64_t Val) const { // For a TOC-relative relocation, proceed in terms of the corresponding // ADDR16 relocation type. 
+ bool IsTocRelType = isTocRelType(Type); std::tie(Type, Val) = toAddr16Rel(Type, Val); switch (Type) { @@ -549,7 +590,10 @@ case R_PPC64_ADDR16_HA: case R_PPC64_REL16_HA: case R_PPC64_TPREL16_HA: - write16(Loc, ha(Val)); + if (Config->TocOptimize && IsTocRelType && ha(Val) == 0) + writeInstrFromHalf16(Loc, 0x60000000); + else + write16(Loc, ha(Val)); break; case R_PPC64_ADDR16_HI: case R_PPC64_REL16_HI: @@ -575,14 +619,38 @@ case R_PPC64_ADDR16_LO: case R_PPC64_REL16_LO: case R_PPC64_TPREL16_LO: + // When the high-adjusted part of a toc relocation evaluates to 0, it is + // changed into a nop. The lo part then needs to be updated to use the + // toc-pointer register r2, as the base register. + if (Config->TocOptimize && IsTocRelType && ha(Val) == 0) { + uint32_t Instr = readInstrFromHalf16(Loc); + if (isInstructionUpdateForm(Instr)) + error(getErrorLocation(Loc) + + "can't toc-optimize an update instruction: 0x" + + utohexstr(Instr)); + Instr = (Instr & 0xFFE00000) | 0x00020000; + writeInstrFromHalf16(Loc, Instr); + } write16(Loc, lo(Val)); break; case R_PPC64_ADDR16_LO_DS: case R_PPC64_TPREL16_LO_DS: { // DQ-form instructions use bits 28-31 as part of the instruction encoding // DS-form instructions only use bits 30-31. - uint16_t Mask = isDQFormInstruction(readInstrFromHalf16(Loc)) ? 0xF : 0x3; + uint32_t Inst = readInstrFromHalf16(Loc); + uint16_t Mask = isDQFormInstruction(Inst) ? 0xF : 0x3; checkAlignment(Loc, lo(Val), Mask + 1, Type); + if (Config->TocOptimize && IsTocRelType && ha(Val) == 0) { + // When the high-adjusted part of a toc relocation evaluates to 0, it is + // changed into a nop. The lo part then needs to be updated to use the toc + // pointer register r2, as the base register. 
+ if (isInstructionUpdateForm(Inst)) + error(getErrorLocation(Loc) + + "Can't toc-optimize an update instruction: 0x" + + Twine::utohexstr(Inst)); + Inst = (Inst & 0xFFE0000F) | 0x00020000; + writeInstrFromHalf16(Loc, Inst); + } write16(Loc, (read16(Loc) & Mask) | lo(Val)); } break; case R_PPC64_ADDR32: Index: lld/trunk/ELF/Config.h =================================================================== --- lld/trunk/ELF/Config.h +++ lld/trunk/ELF/Config.h @@ -171,6 +171,7 @@ bool Trace; bool ThinLTOEmitImportsFiles; bool ThinLTOIndexOnly; + bool TocOptimize; bool UndefinedVersion; bool UseAndroidRelrTags = false; bool WarnBackrefs; Index: lld/trunk/ELF/Driver.cpp =================================================================== --- lld/trunk/ELF/Driver.cpp +++ lld/trunk/ELF/Driver.cpp @@ -280,6 +280,9 @@ if (Config->FixCortexA53Errata843419 && Config->EMachine != EM_AARCH64) error("--fix-cortex-a53-843419 is only supported on AArch64 targets."); + if (Config->TocOptimize && Config->EMachine != EM_PPC64) + error("--toc-optimize is only supported on the PowerPC64 target."); + if (Config->Pie && Config->Shared) error("-shared and -pie may not be used together"); @@ -999,6 +1002,9 @@ Config->WriteAddends = Args.hasFlag(OPT_apply_dynamic_relocs, OPT_no_apply_dynamic_relocs, false) || !Config->IsRela; + + Config->TocOptimize = + Args.hasFlag(OPT_toc_optimize, OPT_no_toc_optimize, Machine == EM_PPC64); } // Returns a value of "-format" option. 
Index: lld/trunk/ELF/Options.td =================================================================== --- lld/trunk/ELF/Options.td +++ lld/trunk/ELF/Options.td @@ -315,6 +315,10 @@ "Run the linker multi-threaded (default)", "Do not run the linker multi-threaded">; +defm toc_optimize : B<"toc-optimize", + "(PowerPC64) Enable TOC related optimizations (default)", + "(PowerPC64) Disable TOC related optimizations">; + def trace: F<"trace">, HelpText<"Print the names of the input files">; defm trace_symbol: Eq<"trace-symbol", "Trace references to symbols">; Index: lld/trunk/test/ELF/ppc64-func-entry-points.s =================================================================== --- lld/trunk/test/ELF/ppc64-func-entry-points.s +++ lld/trunk/test/ELF/ppc64-func-entry-points.s @@ -75,6 +75,6 @@ // CHECK: foo_external_diff: // CHECK-NEXT: 10010080: {{.*}} addis 2, 12, 2 // CHECK-NEXT: 10010084: {{.*}} addi 2, 2, 32640 -// CHECK-NEXT: 10010088: {{.*}} addis 5, 2, 0 +// CHECK-NEXT: 10010088: {{.*}} nop // CHECK: foo_external_same: // CHECK-NEXT: 100100b0: {{.*}} add 3, 4, 3 Index: lld/trunk/test/ELF/ppc64-got-indirect.s =================================================================== --- lld/trunk/test/ELF/ppc64-got-indirect.s +++ lld/trunk/test/ELF/ppc64-got-indirect.s @@ -83,8 +83,8 @@ # CHECK: _start: # CHECK-NEXT: 10010000: {{.*}} addis 2, 12, 3 # CHECK-NEXT: 10010004: {{.*}} addi 2, 2, -32768 -# CHECK-NEXT: 10010008: {{.*}} addis 3, 2, 0 -# CHECK-NEXT: 1001000c: {{.*}} ld 3, -32760(3) +# CHECK-NEXT: 10010008: {{.*}} nop +# CHECK-NEXT: 1001000c: {{.*}} ld 3, -32760(2) # CHECK: 1001001c: {{.*}} lwa 3, 0(3) # CHECK-LE: Disassembly of section .data: Index: lld/trunk/test/ELF/ppc64-relocs.s =================================================================== --- lld/trunk/test/ELF/ppc64-relocs.s +++ lld/trunk/test/ELF/ppc64-relocs.s @@ -63,7 +63,7 @@ # CHECK: Disassembly of section .R_PPC64_TOC16_HA: # CHECK: .FR_PPC64_TOC16_HA: -# CHECK: 10010018: {{.*}} addis 1, 2, 0 +# CHECK: 
10010018: {{.*}} nop .section .R_PPC64_REL24,"ax",@progbits .globl .FR_PPC64_REL24 @@ -160,8 +160,8 @@ # 0x10000190 + 0xfeb4 = 0x10010044 # CHECK: Disassembly of section .R_PPC64_REL32: # CHECK: .FR_PPC64_REL32: -# CHECK: 1001003c: {{.*}} addis 5, 2, 0 -# CHECK: 10010040: {{.*}} ld 5, -32736(5) +# CHECK: 1001003c: {{.*}} nop +# CHECK: 10010040: {{.*}} ld 5, -32736(2) # CHECK: 10010044: {{.*}} add 3, 3, 4 .section .R_PPC64_REL64, "ax",@progbits Index: lld/trunk/test/ELF/ppc64-toc-addis-nop-lqsq.s =================================================================== --- lld/trunk/test/ELF/ppc64-toc-addis-nop-lqsq.s +++ lld/trunk/test/ELF/ppc64-toc-addis-nop-lqsq.s @@ -0,0 +1,73 @@ +# REQUIRES: ppc + +# RUN: llvm-readelf -relocations --wide %p/Inputs/ppc64le-quadword-ldst.o | FileCheck --check-prefix=QuadInputRelocs %s + +# RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %p/Inputs/shared-ppc64.s -o %t2.o +# RUN: ld.lld -shared %t2.o -o %t2.so + +# RUN: ld.lld %t2.so %p/Inputs/ppc64le-quadword-ldst.o -o %t +# RUN: llvm-objdump -D %t | FileCheck --check-prefix=Dis %s + +# RUN: ld.lld --no-toc-optimize %t2.so %p/Inputs/ppc64le-quadword-ldst.o -o %t +# RUN: llvm-objdump -D %t | FileCheck --check-prefix=NoOpt %s + +# QuadInputRelocs: Relocation section '.rela.text' +# QuadInputRelocs: R_PPC64_TOC16_LO_DS 0000000000000000 quadLd +# QuadInputRelocs: R_PPC64_TOC16_LO_DS 0000000000000010 quadSt + +# The powerpc backend doesn't support the quadword load/store instructions yet. +# So they are tested by linking against an object file assembled with +# `as -mpower9 -o ppc64le-quadword-ldst.o in.s` and checking the encoding of +# the unknown instructions in the disassembly. 
Source used as input: +#quads: +#.Lbegin_quads: +#.Lgep_quads: +# addis 2, 12, .TOC.-.Lgep_quads@ha +# addi 2, 2, .TOC.-.Lgep_quads@l +#.Llep_quads: +#.localentry quads, .Llep_quads-.Lgep_quads +# addis 3, 2, quadLd@toc@ha +# lq 4, quadLd@toc@l(3) +# addis 3, 2, quadSt@toc@ha +# stq 4, quadSt@toc@l(3) +# blr +# +# .p2align 4 +# .global quadLd +# .lcomm quadLd, 16 +# +# .global quadSt +# .lcomm quadSt, 16 + + +# e0 82 7f 70 decodes to | 111000 | 00100 | 00010 | 16-bit imm | +# | 56 | 4 | 2 | 32624 | +# which is `lq r4, 32624(r2)` +# f8 82 7f 82 decodes to | 111110 | 00100 | 00010 | 14-bit imm | 10 | +# | 62 | 4 | 2 | 8160 | 2 | +# The immediate represents a word offset so this disassembles to: +# `stq r4, 32640(r2)` +# Dis-LABEL: quads: +# Dis-NEXT: addis +# Dis-NEXT: addi +# Dis-NEXT: nop +# Dis-NEXT: 70 7f 82 e0 +# Dis-NEXT: nop +# Dis-NEXT: 82 7f 82 f8 +# Dis-NEXT: blr + +# e0 83 7f 70 decodes to | 111000 | 00100 | 00011 | 16-bit imm | +# | 56 | 4 | 3 | 32624 | +# `lq r4, 32624(r3)` +# f8 83 7f 82 decodes to | 111110 | 00100 | 00011 | 14-bit imm | 10 | +# | 62 | 4 | 3 | 8160 | 2 | +# `stq r4, 32640(r3)` +# NoOpt-LABEL: quads: +# NoOpt-NEXT: addis +# NoOpt-NEXT: addi +# NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: 70 7f 83 e0 +# NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: 82 7f 83 f8 +# NoOpt-NEXT: blr + Index: lld/trunk/test/ELF/ppc64-toc-addis-nop.s =================================================================== --- lld/trunk/test/ELF/ppc64-toc-addis-nop.s +++ lld/trunk/test/ELF/ppc64-toc-addis-nop.s @@ -0,0 +1,272 @@ +# REQUIRES: ppc + +# RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %s -o %t.o +# RUN: llvm-readelf -relocations --wide %t.o | FileCheck --check-prefix=InputRelocs %s + +# RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %p/Inputs/shared-ppc64.s -o %t2.o +# RUN: ld.lld -shared %t2.o -o %t2.so +# +# RUN: ld.lld %t2.so %t.o -o %t +# RUN: llvm-objdump -D %t | FileCheck --check-prefix=Dis %s +# +# RUN: ld.lld --no-toc-optimize 
%t2.so %t.o -o %t +# RUN: llvm-objdump -D %t | FileCheck --check-prefix=NoOpt %s + +# InputRelocs: Relocation section '.rela.text' +# InputRelocs: R_PPC64_TOC16_HA +# InputRelocs: R_PPC64_TOC16_LO +# InputRelocs: R_PPC64_TOC16_LO_DS + + + .text + .abiversion 2 + + .global bytes + .p2align 4 + .type bytes,@function +bytes: +.Lbytes_gep: + addis 2, 12, .TOC.-.Lbytes_gep@ha + addi 2, 2, .TOC.-.Lbytes_gep@l +.Lbytes_lep: + .localentry bytes, .Lbytes_lep-.Lbytes_gep + addis 3, 2, byteLd@toc@ha + lbz 3, byteLd@toc@l(3) + addis 4, 2, byteSt@toc@ha + stb 3, byteSt@toc@l(4) + blr +# Dis-LABEL: bytes +# Dis-NEXT: addis +# Dis-NEXT: addi +# Dis-NEXT: nop +# Dis-NEXT: lbz 3, 32624(2) +# Dis-NEXT: nop +# Dis-NEXT: stb 3, 32625(2) +# Dis-NEXT: blr + +# NoOpt-LABEL: bytes +# NoOpt-NEXT: addis +# NoOpt-NEXT: addi +# NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: lbz 3, 32624(3) +# NoOpt-NEXT: addis 4, 2, 0 +# NoOpt-NEXT: stb 3, 32625(4) +# NoOpt-NEXT: blr + + .global halfs + .p2align 4 + .type halfs,@function +halfs: +.Lhalfs_gep: + addis 2, 12, .TOC.-.Lhalfs_gep@ha + addi 2, 2, .TOC.-.Lhalfs_gep@l +.Lhalfs_lep: + .localentry halfs, .Lhalfs_lep-.Lhalfs_gep + addis 3, 2, halfLd@toc@ha + lhz 3, halfLd@toc@l(3) + addis 4, 2, halfLd@toc@ha + lha 4, halfLd@toc@l(4) + addis 5, 2, halfSt@toc@ha + sth 4, halfSt@toc@l(5) + blr +# Dis-LABEL: halfs +# Dis-NEXT: addis +# Dis-NEXT: addi +# Dis-NEXT: nop +# Dis-NEXT: lhz 3, 32626(2) +# Dis-NEXT: nop +# Dis-NEXT: lha 4, 32626(2) +# Dis-NEXT: nop +# Dis-NEXT: sth 4, 32628(2) +# Dis-NEXT: blr + +# NoOpt-LABEL: halfs +# NoOpt-NEXT: addis +# NoOpt-NEXT: addi +# NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: lhz 3, 32626(3) +# NoOpt-NEXT: addis 4, 2, 0 +# NoOpt-NEXT: lha 4, 32626(4) +# NoOpt-NEXT: addis 5, 2, 0 +# NoOpt-NEXT: sth 4, 32628(5) +# NoOpt-NEXT: blr + + + .global words + .p2align 4 + .type words,@function +words: +.Lwords_gep: + addis 2, 12, .TOC.-.Lwords_gep@ha + addi 2, 2, .TOC.-.Lwords_gep@l +.Lwords_lep: + .localentry words, 
.Lwords_lep-.Lwords_gep + addis 3, 2, wordLd@toc@ha + lwz 3, wordLd@toc@l(3) + addis 4, 2, wordLd@toc@ha + lwa 4, wordLd@toc@l(4) + addis 5, 2, wordSt@toc@ha + stw 4, wordSt@toc@l(5) + blr +# Dis-LABEL: words +# Dis-NEXT: addis +# Dis-NEXT: addi +# Dis-NEXT: nop +# Dis-NEXT: lwz 3, 32632(2) +# Dis-NEXT: nop +# Dis-NEXT: lwa 4, 32632(2) +# Dis-NEXT: nop +# Dis-NEXT: stw 4, 32636(2) +# Dis-NEXT: blr + +# NoOpt-LABEL: words +# NoOpt-NEXT: addis +# NoOpt-NEXT: addi +# NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: lwz 3, 32632(3) +# NoOpt-NEXT: addis 4, 2, 0 +# NoOpt-NEXT: lwa 4, 32632(4) +# NoOpt-NEXT: addis 5, 2, 0 +# NoOpt-NEXT: stw 4, 32636(5) +# NoOpt-NEXT: blr + + .global doublewords + .p2align 4 + .type doublewords,@function +doublewords: +.Ldoublewords_gep: + addis 2, 12, .TOC.-.Ldoublewords_gep@ha + addi 2, 2, .TOC.-.Ldoublewords_gep@l +.Ldoublewords_lep: + .localentry doublewords, .Ldoublewords_lep-.Ldoublewords_gep + addis 3, 2, dwordLd@toc@ha + ld 3, dwordLd@toc@l(3) + addis 4, 2, dwordSt@toc@ha + std 3, dwordSt@toc@l(4) + blr + +# Dis-LABEL: doublewords +# Dis-NEXT: addis +# Dis-NEXT: addi +# Dis-NEXT: nop +# Dis-NEXT: ld 3, 32640(2) +# Dis-NEXT: nop +# Dis-NEXT: std 3, 32648(2) +# Dis-NEXT: blr + +# NoOpt-LABEL: doublewords +# NoOpt-NEXT: addis +# NoOpt-NEXT: addi +# NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: ld 3, 32640(3) +# NoOpt-NEXT: addis 4, 2, 0 +# NoOpt-NEXT: std 3, 32648(4) +# NoOpt-NEXT: blr + + .global vec_dq + .p2align 4 + .type vec_dq,@function +vec_dq: +.Lvec_dq_gep: + addis 2, 12, .TOC.-.Lvec_dq_gep@ha + addi 2, 2, .TOC.-.Lvec_dq_gep@l +.Lvec_dq_lep: + .localentry vec_dq, .Lvec_dq_lep-.Lvec_dq_gep + addis 3, 2, vecLd@toc@ha + lxv 3, vecLd@toc@l(3) + addis 3, 2, vecSt@toc@ha + stxv 3, vecSt@toc@l(3) + blr + +# Dis-LABEL: vec_dq +# Dis-NEXT: addis +# Dis-NEXT: addi +# Dis-NEXT: nop +# Dis-NEXT: lxv 3, 32656(2) +# Dis-NEXT: nop +# Dis-NEXT: stxv 3, 32672(2) +# Dis-NEXT: blr + +# NoOpt-LABEL: vec_dq +# NoOpt-NEXT: addis +# NoOpt-NEXT: addi +# 
NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: lxv 3, 32656(3) +# NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: stxv 3, 32672(3) +# NoOpt-NEXT: blr + + .global vec_ds + .p2align 4 + .type vec_ds,@function +vec_ds: +.Lvec_ds_gep: + addis 2, 12, .TOC.-.Lvec_ds_gep@ha + addi 2, 2, .TOC.-.Lvec_ds_gep@l +.Lvec_ds_lep: + .localentry vec_ds, .Lvec_dq_lep-.Lvec_dq_gep + addis 3, 2, vecLd@toc@ha + lxsd 3, vecLd@toc@l(3) + addis 3, 2, vecSt@toc@ha + stxsd 3, vecSt@toc@l(3) + addis 3, 2, vecLd@toc@ha + lxssp 3, vecLd@toc@l(3) + addis 3, 2, vecSt@toc@ha + stxssp 3, vecSt@toc@l(3) + blr +# Dis-LABEL: vec_ds +# Dis-NEXT: addis +# Dis-NEXT: addi +# Dis-NEXT: nop +# Dis-NEXT: lxsd 3, 32656(2) +# Dis-NEXT: nop +# Dis-NEXT: stxsd 3, 32672(2) +# Dis-NEXT: nop +# Dis-NEXT: lxssp 3, 32656(2) +# Dis-NEXT: nop +# Dis-NEXT: stxssp 3, 32672(2) +# Dis-NEXT: blr + +# NoOpt-LABEL: vec_ds +# NoOpt-NEXT: addis +# NoOpt-NEXT: addi +# NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: lxsd 3, 32656(3) +# NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: stxsd 3, 32672(3) +# NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: lxssp 3, 32656(3) +# NoOpt-NEXT: addis 3, 2, 0 +# NoOpt-NEXT: stxssp 3, 32672(3) +# NoOpt-NEXT: blr + + + .global byteLd + .lcomm byteLd, 1, 1 + + .global byteSt + .lcomm byteSt, 1, 1 + + .global halfLd + .lcomm halfLd, 2, 2 + + .global halfSt + .lcomm halfSt, 2, 2 + + .global wordLd + .lcomm wordLd, 4, 4 + + .global wordSt + .lcomm wordSt, 4, 4 + + .global dwordLd + .lcomm dwordLd, 8, 8 + + .global dwordSt + .lcomm dwordSt, 8, 8 + + .global vecLd + .lcomm vecLd, 16, 16 + + .global vecSt + .lcomm vecSt, 16, 16 Index: lld/trunk/test/ELF/ppc64-tocopt-option.s =================================================================== --- lld/trunk/test/ELF/ppc64-tocopt-option.s +++ lld/trunk/test/ELF/ppc64-tocopt-option.s @@ -0,0 +1,14 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t +# RUN: not ld.lld %t --toc-optimize -o /dev/null 2>&1 | FileCheck %s + +# CHECK: error: --toc-optimize 
is only supported on the PowerPC64 target. + + .global __start + .type __start,@function + + .text + .quad 0 + __start: +