diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -269,9 +269,9 @@ case R_RISCV_TPREL_LO12_I: case R_RISCV_TPREL_LO12_S: return R_TPREL; - case R_RISCV_RELAX: case R_RISCV_TPREL_ADD: return R_NONE; + case R_RISCV_RELAX: case R_RISCV_ALIGN: return R_RELAX_HINT; default: @@ -466,34 +466,98 @@ break; case R_RISCV_RELAX: - return; // Ignored (for now) + return; // Processed in a later pass default: llvm_unreachable("unknown relocation"); } } -using AdjustRange = InputSectionBase::AdjustRange; -using AdjustRanges = SmallVector; +using AdjustRanges = SmallVector; + +// Relax R_RISCV_CALL to jal, c.j, or c.jal. +// Return the original instruction pair in case we need to undo the relaxation. +static uint64_t relaxCall(InputSection *isec, Relocation &r, int64_t &delta, + AdjustRanges &ranges) { + Defined *d = dyn_cast_or_null(r.sym); + if (!d || !d->section) + return 0; + bool rvc = config->eflags & EF_RISCV_RVC; + uint64_t insnPair = read64le(isec->data().data() + r.offset); + unsigned rd = extractBits(insnPair, 32 + 11, 32 + 7); + uint64_t pc = isec->getVA(r.offset) + delta; + uint64_t target = r.expr == R_PLT_PC ? d->getPltVA() : d->getVA(r.addend); + int64_t displace = target - pc; + + // Convert to c.j if displace fits in 12 bits. + if (rvc && isInt<12>(displace) && rd == 0) { + write16le(isec->mutableData().data() + r.offset, 0xa001); // c.j + r.type = R_RISCV_RVC_JUMP; + ranges.push_back({r.offset + 2, -6}); + delta -= 6; + return insnPair; + } + // Convert to c.jal (RV32 only) if displace fits in 12 bits. + if (rvc && isInt<12>(displace) && rd == X_RA && !config->is64) { + write16le(isec->mutableData().data() + r.offset, 0x2001); // c.jal + r.type = R_RISCV_RVC_JUMP; + ranges.push_back({r.offset + 2, -6}); + delta -= 6; + return insnPair; + } + // Convert to jal if displace fits in 21 bits. 
+ if (isInt<21>(displace)) { + write32le(isec->mutableData().data() + r.offset, 0x6f | rd << 7); // jal + r.type = R_RISCV_JAL; + ranges.push_back({r.offset + 4, -4}); + delta -= 4; + return insnPair; + } + return 0; +} + +// Verify that the displacement still fits within the smaller immediate +// field of the relaxed instruction. If not, we will need to undo it. +static bool unrelaxCall(InputSection *isec, Relocation &r, int64_t &delta, + AdjustRanges &ranges) { + auto *d = dyn_cast_or_null(r.sym); + assert(d && "r.sym not defined in unrelaxCall"); + uint64_t pc = isec->getVA(r.offset) + delta; + uint64_t target = r.expr == R_PLT_PC ? d->getPltVA() : d->getVA(r.addend); + int64_t displace = target - pc; + + if (r.type == R_RISCV_RVC_JUMP && !isInt<12>(displace)) { + r.type = R_RISCV_CALL; + ranges.push_back({r.offset, 6}); + delta += 6; + // warn(isec->getObjMsg(r.offset) + ": undo c.jal relaxation"); + return true; + } + if (r.type == R_RISCV_JAL && !isInt<21>(displace)) { + r.type = R_RISCV_CALL; + ranges.push_back({r.offset, 4}); + delta += 4; + // warn(isec->getObjMsg(r.offset) + ": undo jal relaxation"); + return true; + } + // This relaxation is still valid, so don't undo + return false; +} // As input, the addend of R_RISCV_ALIGN holds the number of NOP bytes emitted -// by the compiler. We derive the desired alignment boundary by rounding this up -// to the nearest power of two. The multi-pass relaxation algorithm needs two -// quantities per R_RISCV_ALIGN: current NOP byte count, and alignment -// boundary. Once we alter the NOP byte count, we lose the capacity to correctly -// derive the alignment boundary. Therefore, we must derive the boundary from -// the initial value of NOP byte count and then store it. The 64-bit addend -// member is more than wide enough to keep both the NOP byte count and alignment -// boundary. +// by the compiler. We derive the desired alignment boundary by rounding this +// up to the nearest power of two. 
The multi-pass relaxation algorithm needs +// two quantities per R_RISCV_ALIGN: current NOP byte count, and alignment +// boundary. Once we alter the NOP byte count, we lose the capacity to +// correctly derive the alignment boundary. Therefore, we must derive the +// boundary from the initial value of NOP byte count and then store it. The +// 64-bit addend member is more than wide enough to keep both the NOP byte +// count and alignment boundary. struct AlignAddend { uint32_t bytes; uint32_t boundary; }; -// NOTE: The code structure is more complex than necessary -// for handling R_RISCV_ALIGN alone. It is designed to accommodate -// call/jump/load/store/addr-arithmetic relocs in later diffs. - // Derive & store alignment boundaries for all R_RISCV_ALIGN relocs static void setAlignBoundaries() { for (OutputSection *osec : outputSections) @@ -530,41 +594,62 @@ } } +// After one or more contractions and/or expansions of the address range that +// rounds-up to the alignment boundary, the sequence of NOPs emitted by the +// compiler could be corrupted. E.g., a 4-byte NOP might be truncated to 2 +// bytes, requiring a rewrite to the proper C.NOP encoding. We repair by +// rewriting an optimal sequence of NOPs. +static void fillAlignGap(InputSection *isec, Relocation &r) { + uint8_t *buf = isec->mutableData().data() + r.offset; + AlignAddend *aa = reinterpret_cast(&r.addend); + int keepNopBytes = aa->bytes; + while (keepNopBytes >= 4) { + // Greedily fill as many 4-byte NOPs as possible + write32le(buf, 0x00000013); // nop + keepNopBytes -= 4; + buf += 4; + } + if (keepNopBytes == 2) { + // Complete the sequence with C.NOP if space remains after + // the 4-byte NOPs. 
+ assert(config->eflags & EF_RISCV_RVC && "expected RVC for 2-byte NOP"); + write16le(buf, 0x0001); // c.nop + keepNopBytes -= 2; + buf += 2; + } + assert(keepNopBytes == 0); +} + // Fill the gaps created by adding bytes (when delta > 0) to the section: // * After the alignment gap length is known, fill with NOPs. +// * After a relaxation is undone, restore the original instruction(s). static void fillAdjustGaps() { for (OutputSection *osec : outputSections) for (InputSection *isec : getInputSections(*osec)) { if (!(isec->flags & SHF_EXECINSTR)) continue; - for (Relocation &r : isec->relocations) + MutableArrayRef rels = isec->relocations; + for (auto it = rels.begin(); it != rels.end(); ++it) { + Relocation &r = *it; if (r.type == R_RISCV_ALIGN) { - // After one or more contractions and/or expansions of the address - // range that rounds-up to the alignment boundary, the sequence of - // NOPs emitted by the compiler could be corrupted. E.g., a 4-byte - // NOP might be truncated to 2 bytes, requiring a rewrite to the - // proper C.NOP encoding. We repair by rewriting an optimal - // sequence of NOPs. - uint8_t *buf = isec->mutableData().data() + r.offset; - AlignAddend *aa = reinterpret_cast(&r.addend); - int keepNopBytes = aa->bytes; - while (keepNopBytes >= 4) { - // Greedily fill as many 4-byte NOPs as possible - write32le(buf, 0x00000013); // nop - keepNopBytes -= 4; - buf += 4; - } - if (keepNopBytes == 2) { - // Complete the sequence with C.NOP if space remains after - // the 4-byte NOPs. 
- assert(config->eflags & EF_RISCV_RVC && - "expected RVC for 2-byte NOP"); - write16le(buf, 0x0001); // c.nop - keepNopBytes -= 2; - buf += 2; - } - assert(keepNopBytes == 0); + fillAlignGap(isec, r); + continue; } + if (!config->relax || it + 1 == rels.end() || it[1].addend == 0 || + it[1].type != R_RISCV_NONE || r.offset != it[1].offset) + continue; + // This reloc has a paired R_RISCV_RELAX at it[1] with matching + // offset and non-zero addend, so it might need the original + // instruction restored for an undone relaxation + uint8_t *buf = isec->mutableData().data() + r.offset; + switch (r.type) { + case R_RISCV_CALL: + write64le(buf, it[1].addend); + break; + default: + llvm_unreachable("unknown relaxation undo"); + } + } } } @@ -588,11 +673,60 @@ for (InputSection *isec : getInputSections(*osec)) { if (!(isec->flags & SHF_EXECINSTR)) continue; + int64_t delta = 0; - for (Relocation &r : isec->relocations) { - if (r.type == R_RISCV_ALIGN) + MutableArrayRef rels = isec->relocations; + for (auto it = rels.begin(); it != rels.end(); ++it) { + Relocation &r = *it; + if (r.type == R_RISCV_ALIGN) { relaxAlign(isec, r, delta, ranges); - // TODO: handle call/jump/load/store/addr-arithmetic relaxation + continue; + } + if (!config->relax || it + 1 == rels.end() || + it[1].type != R_RISCV_RELAX || r.offset != it[1].offset) + continue; + // This reloc has a paired R_RISCV_RELAX at it[1] with matching + // offset, so it is eligible for relaxation. The addend of + // R_RISCV_RELAX is unused, so we save the original unrelaxed + // instruction(s) there in case we need to undo the relaxation + // in a later pass. 
+ switch (r.type) { + case R_RISCV_CALL: + case R_RISCV_CALL_PLT: + it[1].addend = relaxCall(isec, r, delta, ranges); + break; + case R_RISCV_RVC_JUMP: + case R_RISCV_JAL: + // Undo R_RISCV_CALL(_PLT) + if (unrelaxCall(isec, r, delta, ranges)) + it[1].type = R_RISCV_NONE; // Mark for undo, and prevent redo + break; + case R_RISCV_HI20: + case R_RISCV_PCREL_HI20: + // TODO: relax hi20 + break; + case R_RISCV_RVC_LUI: + // TODO: undo HI20 relaxation + break; + case R_RISCV_LO12_I: + case R_RISCV_LO12_S: + // TODO: relax lo12 + break; + case R_RISCV_PCREL_LO12_I: + case R_RISCV_PCREL_LO12_S: + // TODO: relax pcrel lo12 + break; + case R_RISCV_TPREL_HI20: + case R_RISCV_TPREL_ADD: + case R_RISCV_TPREL_LO12_I: + case R_RISCV_TPREL_LO12_S: + // TODO: relax tls + break; + case R_RISCV_NONE: + break; + default: + llvm_unreachable("unknown relaxation"); + } } if (!ranges.empty()) { isec->adjustRanges(ranges, sectionSymbolAddrs[isec]); @@ -613,8 +747,10 @@ setAlignBoundaries(); SectionSymbolAddrs sectionSymbolAddrs; fillSectionSymbolAddrs(sectionSymbolAddrs); + int passes = 0; while (relaxOnce(sectionSymbolAddrs)) - ; + passes++; + // warn("relaxed in " + Twine(passes) + " passes"); fillAdjustGaps(); } diff --git a/lld/test/ELF/riscv-relax-call-plt.s b/lld/test/ELF/riscv-relax-call-plt.s new file mode 100644 --- /dev/null +++ b/lld/test/ELF/riscv-relax-call-plt.s @@ -0,0 +1,44 @@ +# REQUIRES: riscv +# RUN: rm -rf %t && mkdir -p %t && cd %t + +# RUN: llvm-mc -filetype=obj -triple=riscv32-unknown-elf \ +# RUN: -mattr=+c,+relax %s -o rv32.o +# RUN: llvm-mc -filetype=obj -triple=riscv64-unknown-elf \ +# RUN: -mattr=+c,+relax %s -o rv64.o + +# Verify that recursive call when linked without `-shared` goes directly +# to the call target and can relax to `c.j`. 
+# +# RUN: ld.lld rv32.o -o rv32 +# RUN: ld.lld rv64.o -o rv64 +# RUN: llvm-objdump -d -M no-aliases rv32 > rv32.dis +# RUN: llvm-objdump -d -M no-aliases rv64 > rv64.dis +# RUN: FileCheck --check-prefix=RELAX %s < rv32.dis +# RUN: FileCheck --check-prefix=RELAX %s < rv64.dis + +# RELAX: <_start>: +# RELAX-NEXT: c.nop +# RELAX-NEXT: c.j {{.+}} <_start> + +# Verify that recursive call when linked with `-shared` goes through +# the PLT, which is too far away to relax as `c.j`. +# +# RUN: ld.lld -shared rv32.o -o plt.rv32 +# RUN: ld.lld -shared rv64.o -o plt.rv64 +# RUN: llvm-objdump -d -M no-aliases plt.rv32 > plt.rv32.dis +# RUN: llvm-objdump -d -M no-aliases plt.rv64 > plt.rv64.dis +# RUN: FileCheck --check-prefix=PLT %s < plt.rv32.dis +# RUN: FileCheck --check-prefix=PLT %s < plt.rv64.dis + +# PLT: <_start>: +# PLT-NEXT: c.nop +# PLT-NEXT: jal +# PLT-DAG: <.plt>: + +.global _start +_start: + nop + tail _start + .rept 0x800 + .byte 0 + .endr diff --git a/lld/test/ELF/riscv-relax-call-undo.s b/lld/test/ELF/riscv-relax-call-undo.s new file mode 100644 --- /dev/null +++ b/lld/test/ELF/riscv-relax-call-undo.s @@ -0,0 +1,105 @@ +# REQUIRES: riscv +# RUN: rm -rf %t && split-file %s %t && cd %t + +# RUN: llvm-mc -filetype=obj -triple=riscv32-unknown-elf \ +# RUN: -mattr=+relax a.s -o a.rv32.o +# RUN: llvm-mc -filetype=obj -triple=riscv32-unknown-elf \ +# RUN: -mattr=+relax z.s -o z.rv32.o +# RUN: llvm-mc -filetype=obj -triple=riscv64-unknown-elf \ +# RUN: -mattr=+relax a.s -o a.rv64.o +# RUN: llvm-mc -filetype=obj -triple=riscv64-unknown-elf \ +# RUN: -mattr=+relax z.s -o z.rv64.o + +# A later relaxation pass can cause an earlier relaxation to go +# out of range, necessitating undo. Verify that relaxations within +# _stop4 and _stop5 are undone. The progression of address assignments +# is detailed in the comments next to each instruction and alignment +# directive. "--" marks a relaxation followed by new address range. +# "++" prefixes an undone relaxation.
+# +# RUN: echo 'SECTIONS { .text 0x100000 : { a.*.o } \ +# RUN: .tex2 0x200000 : { z.*.o } }' > undo.lds +# RUN: ld.lld -T undo.lds a.rv32.o z.rv32.o -o undo.rv32 +# RUN: ld.lld -T undo.lds a.rv64.o z.rv64.o -o undo.rv64 +# RUN: llvm-objdump -d undo.rv32 > undo.rv32.dis +# RUN: llvm-objdump -d undo.rv64 > undo.rv64.dis +# RUN: FileCheck %s < undo.rv32.dis +# RUN: FileCheck %s < undo.rv64.dis + +CHECK: <_start>: +CHECK-DAG: auipc ra, 256 +CHECK-DAG: jalr ra +CHECK-DAG: <_start1>: +CHECK-DAG: auipc ra, 256 +CHECK-DAG: jalr ra +CHECK-DAG: <_start2>: +CHECK-DAG: auipc ra, 256 +CHECK-DAG: jalr ra +CHECK-DAG: <_start3>: +CHECK-DAG: jal 0x200014 <_stop3> +CHECK-DAG: <_start4>: +CHECK-DAG: auipc ra, 256 +CHECK-DAG: jalr 4(ra) +CHECK-DAG: <_start5>: +CHECK-DAG: auipc ra, 256 +CHECK-DAG: jalr 4(ra) +CHECK-DAG: <_stop>: +CHECK-DAG: jal 0x100000 <_start> +CHECK-DAG: nop +CHECK-DAG: <_stop1>: +CHECK-DAG: jal 0x100008 <_start1> +CHECK-DAG: nop +CHECK-DAG: <_stop2>: +CHECK-DAG: jal 0x100010 <_start2> +CHECK-DAG: <_stop3>: +CHECK-DAG: jal 0x100018 <_start3> +CHECK-DAG: nop +CHECK-DAG: nop +CHECK-DAG: <_stop4>: +CHECK-DAG: auipc ra, 1048320 +CHECK-DAG: jalr -4(ra) +CHECK-DAG: <_stop5>: +CHECK-DAG: auipc ra, 1048320 +CHECK-DAG: jalr -4(ra) + + +##################### original pass 1 pass 2 + +#--- a.s +.text +.global _start, _start1, _start2, _start3, _start4, _start5 +_start: + call _stop # [0x00, 0x08) +_start1: + call _stop1 # [0x08, 0x10) +_start2: + call _stop2 # [0x10, 0x18) +_start3: + call _stop3 # [0x18, 0x20) -- [0x18, 0x1c) +_start4: + call _stop4 # [0x20, 0x28) [0x1c, 0x24) +_start5: + call _stop5 # [0x28, 0x30) [0x24, 0x2c) + +##################### original pass 1 pass 2 + +#--- z.s +.text +.global _stop, _stop1, _stop2, _stop3, _stop4, _stop5 +_stop: + call _start # [0x00, 0x08) -- [0x00, 0x04) + nop # [0x08, 0x0c) [0x04, 0x08) +_stop1: + call _start1 # [0x0c, 0x14) -- [0x08, 0x0c) +.balign 8 # [0x14, 0x18) [0x0c, 0x10) +_stop2: + call _start2 # [0x18, 0x20) -- [0x10, 
0x14) +_stop3: + call _start3 # [0x20, 0x28) -- [0x14, 0x18) +.balign 16 # [0x28, 0x34) -- [0x18, 0x20) +_stop4: + call _start4 # [0x34, 0x3c) -- [0x20, 0x24) ++ [0x20, 0x28) +_stop5: + call _start5 # [0x3c, 0x44) -- [0x24, 0x28) ++ [0x28, 0x30) + +##################### original pass 1 pass 2 diff --git a/lld/test/ELF/riscv-relax-call.s b/lld/test/ELF/riscv-relax-call.s new file mode 100644 --- /dev/null +++ b/lld/test/ELF/riscv-relax-call.s @@ -0,0 +1,99 @@ +# REQUIRES: riscv +# RUN: rm -rf %t && mkdir %t && cd %t + +# RUN: llvm-mc -filetype=obj -triple=riscv32-unknown-elf -mattr=+relax %s -o rv32.o +# RUN: llvm-mc -filetype=obj -triple=riscv64-unknown-elf -mattr=+relax %s -o rv64.o +# RUN: llvm-mc -filetype=obj -triple=riscv32-unknown-elf -mattr=+c,+relax %s -o rv32c.o +# RUN: llvm-mc -filetype=obj -triple=riscv64-unknown-elf -mattr=+c,+relax %s -o rv64c.o + +# jal relaxation +# +# RUN: ld.lld rv32.o --defsym foo=_start+0x14 -o jal.rv32 +# RUN: ld.lld rv64.o --defsym foo=_start+0x14 -o jal.rv64 +# RUN: llvm-objdump -d -M no-aliases jal.rv32 > jal.rv32.dis +# RUN: llvm-objdump -d -M no-aliases jal.rv64 > jal.rv64.dis +# RUN: FileCheck --check-prefix=JAL %s < jal.rv32.dis +# RUN: FileCheck --check-prefix=JAL %s < jal.rv64.dis + +# Don't relax to c.j/c.jal if out of range +# +# RUN: ld.lld rv32c.o --defsym foo=_start+0x1004 -o nocj.rv32c +# RUN: ld.lld rv64c.o --defsym foo=_start+0x1004 -o nocj.rv64c +# RUN: llvm-objdump -d -M no-aliases nocj.rv32c > nocj.rv32c.dis +# RUN: llvm-objdump -d -M no-aliases nocj.rv64c > nocj.rv64c.dis +# RUN: FileCheck --check-prefix=JAL %s < nocj.rv32c.dis +# RUN: FileCheck --check-prefix=JAL %s < nocj.rv64c.dis + +# JAL: jal ra, {{.*}} +# JAL-NEXT: jal zero, {{.*}} + +# c.j and c.jal (RV32C-only) relaxation +# +# RUN: ld.lld rv32c.o --defsym foo=_start+0x14 -o cj.rv32c +# RUN: ld.lld rv64c.o --defsym foo=_start+0x14 -o cj.rv64c +# RUN: llvm-objdump -d -M no-aliases cj.rv32c > cj.rv32c.dis +# RUN: llvm-objdump -d -M no-aliases cj.rv64c
> cj.rv64c.dis +# RUN: FileCheck --check-prefix=CJ32 %s < cj.rv32c.dis +# RUN: FileCheck --check-prefix=CJ64 %s < cj.rv64c.dis + +# Check relaxation works across output sections +# +# RUN: echo 'SECTIONS { .text 0x100000 : { *(.text) } \ +# RUN: .foo : ALIGN(8) { foo = .; } }' > xs.lds +# RUN: ld.lld -T xs.lds rv32c.o -o xs.rv32c +# RUN: ld.lld -T xs.lds rv64c.o -o xs.rv64c +# RUN: llvm-objdump -d -M no-aliases xs.rv32c > xs.rv32c.dis +# RUN: llvm-objdump -d -M no-aliases xs.rv64c > xs.rv64c.dis +# RUN: FileCheck --check-prefix=CJ32 %s < xs.rv32c.dis +# RUN: FileCheck --check-prefix=CJ64 %s < xs.rv64c.dis + +# CJ32: c.jal {{.*}} +# CJ32-NEXT: c.j {{.*}} +# CJ64: jal ra, {{.*}} +# CJ64-NEXT: c.j {{.*}} + +# Don't relax if out of range. +# (call is out of range, tail is within range) +# +# RUN: ld.lld rv32c.o --defsym foo=_start+0x100000 -o boundary.rv32c +# RUN: ld.lld rv64c.o --defsym foo=_start+0x100000 -o boundary.rv64c +# RUN: llvm-objdump -d -M no-aliases boundary.rv32c > boundary.rv32c.dis +# RUN: llvm-objdump -d -M no-aliases boundary.rv64c > boundary.rv64c.dis +# RUN: FileCheck --check-prefix=BOUNDARY %s < boundary.rv32c.dis +# RUN: FileCheck --check-prefix=BOUNDARY %s < boundary.rv64c.dis + +# Check that section alignment is factored-into the call displacement. 
+# (call is out of range, tail is within range) +# +# RUN: echo 'SECTIONS { .text 0x100000 : { *(.text) } \ +# RUN: .foo : ALIGN(0x100000) { foo = .; } }' > xalign.lds +# RUN: ld.lld -T xalign.lds rv32c.o -o xalign.rv32c +# RUN: ld.lld -T xalign.lds rv64c.o -o xalign.rv64c +# RUN: llvm-objdump -d -M no-aliases xalign.rv32c > xalign.rv32c.dis +# RUN: llvm-objdump -d -M no-aliases xalign.rv64c > xalign.rv64c.dis +# RUN: FileCheck --check-prefix=BOUNDARY %s < xalign.rv32c.dis +# RUN: FileCheck --check-prefix=BOUNDARY %s < xalign.rv64c.dis + +# BOUNDARY: auipc ra, 256 +# BOUNDARY-NEXT: jalr ra, 0(ra) +# BOUNDARY-NEXT: jal zero, {{.*}} + +# Don't relax to absolute symbols +# +# RUN: ld.lld rv32c.o -Ttext=0x100000 --defsym foo=0x100000 -o abs.rv32c +# RUN: ld.lld rv64c.o -Ttext=0x100000 --defsym foo=0x100000 -o abs.rv64c +# RUN: llvm-objdump -d -M no-aliases abs.rv32c > abs.rv32c.dis +# RUN: llvm-objdump -d -M no-aliases abs.rv64c > abs.rv64c.dis +# RUN: FileCheck --check-prefix=NORELAX %s < abs.rv32c.dis +# RUN: FileCheck --check-prefix=NORELAX %s < abs.rv64c.dis + +# NORELAX: auipc ra, {{.*}} +# NORELAX-NEXT: jalr ra, {{.*}}(ra) +# NORELAX: auipc t1, {{.*}} +# NORELAX-NEXT: jalr zero, {{.*}}(t1) + +.global _start +.p2align 3 +_start: + call foo + tail foo