diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h --- a/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/ELF_x86_64.h @@ -24,7 +24,8 @@ Pointer64, PCRel32, PCRel32GOTLoad, - PCRel32REXGOTLoad, + PCRel32GOTLoadRelaxable, + PCRel32REXGOTLoadRelaxable, PCRel64GOT, GOTOFF64, GOT64, diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -184,10 +184,11 @@ case ELF::R_X86_64_64: return ELF_x86_64_Edges::ELFX86RelocationKind::Pointer64; case ELF::R_X86_64_GOTPCREL: - case ELF::R_X86_64_GOTPCRELX: return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32GOTLoad; + case ELF::R_X86_64_GOTPCRELX: + return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32GOTLoadRelaxable; case ELF::R_X86_64_REX_GOTPCRELX: - return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32REXGOTLoad; + return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32REXGOTLoadRelaxable; case ELF::R_X86_64_GOTPCREL64: return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel64GOT; case ELF::R_X86_64_GOT64: @@ -301,15 +302,19 @@ Kind = x86_64::Pointer64; break; case PCRel32GOTLoad: { - Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable; - Addend = 0; + Kind = x86_64::RequestGOTAndTransformToDelta32; break; } - case PCRel32REXGOTLoad: { + case PCRel32REXGOTLoadRelaxable: { Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadREXRelaxable; Addend = 0; break; } + case PCRel32GOTLoadRelaxable: { + Kind = x86_64::RequestGOTAndTransformToPCRel32GOTLoadRelaxable; + Addend = 0; + break; + } case PCRel64GOT: { Kind = x86_64::RequestGOTAndTransformToDelta64; break; @@ -498,7 +503,9 @@ return "PCRel32"; case PCRel32GOTLoad: return "PCRel32GOTLoad"; - case PCRel32REXGOTLoad: + case PCRel32GOTLoadRelaxable: + return 
"PCRel32GOTLoadRelaxable"; + case PCRel32REXGOTLoadRelaxable: return "PCRel32REXGOTLoad"; case PCRel64GOT: return "PCRel64GOT"; diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp --- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp @@ -73,42 +73,79 @@ LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n"); for (auto *B : G.blocks()) - for (auto &E : B->edges()) - if (E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) { - // Replace GOT load with LEA only for MOVQ instructions. - assert(E.getOffset() >= 3 && "GOT edge occurs too early in block"); - - constexpr uint8_t MOVQRIPRel[] = {0x48, 0x8b}; - if (strncmp(B->getContent().data() + E.getOffset() - 3, - reinterpret_cast(MOVQRIPRel), 2) != 0) - continue; - - auto &GOTBlock = E.getTarget().getBlock(); - assert(GOTBlock.getSize() == G.getPointerSize() && + for (auto &E : B->edges()) { + if (E.getKind() == x86_64::PCRel32GOTLoadRelaxable || + E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable) { + bool REXPrefix = E.getKind() == x86_64::PCRel32GOTLoadREXRelaxable; + assert(E.getOffset() >= (REXPrefix ? 
3 : 2) && + "GOT edge occurs too early in block"); + auto *FixupData = reinterpret_cast<uint8_t *>( + const_cast<char *>(B->getContent().data())) + + E.getOffset(); + const uint8_t Op = FixupData[-2]; + const uint8_t ModRM = FixupData[-1]; + + auto &GOTEntryBlock = E.getTarget().getBlock(); + assert(GOTEntryBlock.getSize() == G.getPointerSize() && "GOT entry block should be pointer sized"); - assert(GOTBlock.edges_size() == 1 && + assert(GOTEntryBlock.edges_size() == 1 && "GOT entry should only have one outgoing edge"); - - auto &GOTTarget = GOTBlock.edges().begin()->getTarget(); - JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset(); + auto &GOTTarget = GOTEntryBlock.edges().begin()->getTarget(); JITTargetAddress TargetAddr = GOTTarget.getAddress(); - + JITTargetAddress EdgeAddr = B->getFixupAddress(E); int64_t Displacement = TargetAddr - EdgeAddr + 4; - if (isInRangeForImmS32(Displacement)) { - // Change the edge kind as we don't go through GOT anymore. This is - // for formal correctness only. Technically, the two relocation kinds - // are resolved the same way. + bool TargetInRangeForImmU32 = isInRangeForImmU32(TargetAddr); + bool DisplacementInRangeForImmS32 = isInRangeForImmS32(Displacement); + + // If both the target address and the displacement are out of range, then + // there is no optimization opportunity. + if (!(TargetInRangeForImmU32 || DisplacementInRangeForImmS32)) + continue; + + // Transform "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
+ if (Op == 0x8b && DisplacementInRangeForImmS32) { + FixupData[-2] = 0x8d; E.setKind(x86_64::Delta32); E.setTarget(GOTTarget); E.setAddend(E.getAddend() - 4); - auto *BlockData = reinterpret_cast<uint8_t *>( - const_cast<char *>(B->getContent().data())); - BlockData[E.getOffset() - 2] = 0x8d; LLVM_DEBUG({ dbgs() << " Replaced GOT load wih LEA:\n "; printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind())); dbgs() << "\n"; }); + continue; + } + + // Transform call/jmp instructions + if (Op == 0xff && TargetInRangeForImmU32) { + if (ModRM == 0x15) { + // The ABI says we can convert "call *foo@GOTPCREL(%rip)" to "nop; call + // foo", but lld converts it to "addr32 call foo", because that makes + // the result a single instruction. + FixupData[-2] = 0x67; + FixupData[-1] = 0xe8; + LLVM_DEBUG({ + dbgs() << " replaced call instruction's memory operand with imm " + "operand:\n "; + printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind())); + dbgs() << "\n"; + }); + } else { + // Transform "jmp *foo@GOTPCREL(%rip)" to "jmp foo; nop" + assert(ModRM == 0x25 && "Invalid ModRm for call/jmp instructions"); + FixupData[-2] = 0xe9; + FixupData[3] = 0x90; + E.setOffset(E.getOffset() - 1); + LLVM_DEBUG({ + dbgs() << " replaced jmp instruction's memory operand with imm " + "operand:\n "; + printEdge(dbgs(), *B, E, getEdgeKindName(E.getKind())); + dbgs() << "\n"; + }); + } + E.setKind(x86_64::Pointer32); + E.setTarget(GOTTarget); + continue; } } else if (E.getKind() == x86_64::BranchPCRel32ToPtrJumpStubBypassable) { auto &StubBlock = E.getTarget().getBlock(); @@ -138,6 +175,7 @@ }); } } + } return Error::success(); } diff --git a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_common.s b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_common.s --- a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_common.s +++ b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_common.s @@ -1,5 +1,5 @@ # RUN: rm -rf %t && mkdir -p %t -# RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent -filetype=obj -o
%t/elf_common.o %s +# RUN: llvm-mc -triple=x86_64-unknown-linux -relax-relocations=false -position-independent -filetype=obj -o %t/elf_common.o %s # RUN: llvm-jitlink -entry=load_common -noexec -check %s %t/elf_common.o .text diff --git a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_got_plt_optimizations.s b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_got_plt_optimizations.s new file mode 100644 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_got_plt_optimizations.s @@ -0,0 +1,63 @@ +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent -filetype=obj \ +# RUN: -o %t/elf_sm_pic_reloc.o %s +# RUN: llvm-jitlink -noexec -slab-allocate 100Kb -slab-address 0xfff00000 \ +# RUN: -define-abs extern_in_range32=0xffe00000 \ +# RUN: -check %s %t/elf_sm_pic_reloc.o +# + + + .text + .file "testcase.c" + +# Empty main entry point. + .globl main + .p2align 4, 0x90 + .type main,@function +main: + retq + + .size main, .-main + +# Test optimization of transforming "call *foo@GOTPCREL(%rip)" to "addr32 call foo" +# We need to check both the target address and the instruction opcodes +# jitlink-check: decode_operand(test_call_gotpcrelx, 0)[31:0] = extern_in_range32 +# jitlink-check: *{1}test_call_gotpcrelx = 0x67 +# jitlink-check: *{1}test_call_gotpcrelx+1 = 0xe8 + .globl test_call_gotpcrelx + .p2align 4, 0x90 + .type test_call_gotpcrelx,@function +test_call_gotpcrelx: + call *extern_in_range32@GOTPCREL(%rip) + + .size test_call_gotpcrelx, .-test_call_gotpcrelx + + +# Test optimization of transforming "jmp *foo@GOTPCREL(%rip)" to "jmp foo ; nop" +# We need to check both the target address and the instruction opcodes +# jitlink-check: decode_operand(test_jmp_gotpcrelx, 0)[31:0] = extern_in_range32 +# jitlink-check: *{1}test_jmp_gotpcrelx = 0xe9 +# jitlink-check: *{1}test_jmp_gotpcrelx+5 = 0x90 + .globl test_jmp_gotpcrelx + .p2align 4, 0x90 + .type test_jmp_gotpcrelx,@function +test_jmp_gotpcrelx: + jmp
*extern_in_range32@GOTPCREL(%rip) + + .size test_jmp_gotpcrelx, .-test_jmp_gotpcrelx + +# Check R_X86_64_PLT32 handling with a call to an external. This produces a +# Branch32ToStub edge, because externals are not defined locally. During +# resolution, the target turns out to be in-range from the callsite and so the +# edge is relaxed in post-allocation optimization. +# +# jitlink-check: decode_operand(test_call_extern, 0) = \ +# jitlink-check: extern_in_range32 - next_pc(test_call_extern) + .globl test_call_extern + .p2align 4, 0x90 + .type test_call_extern,@function +test_call_extern: + callq extern_in_range32@plt + + .size test_call_extern, .-test_call_extern + diff --git a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_small_pic_relocations.s b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_small_pic_relocations.s --- a/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_small_pic_relocations.s +++ b/llvm/test/ExecutionEngine/JITLink/X86/ELF_x86-64_small_pic_relocations.s @@ -3,7 +3,6 @@ # RUN: -o %t/elf_sm_pic_reloc.o %s # RUN: llvm-jitlink -noexec -slab-allocate 100Kb -slab-address 0xfff00000 \ # RUN: -define-abs external_data=0x1 \ -# RUN: -define-abs extern_in_range32=0xffe00000 \ # RUN: -define-abs extern_out_of_range32=0x7fff00000000 \ # RUN: -check %s %t/elf_sm_pic_reloc.o # @@ -51,21 +50,6 @@ .size test_call_local, .-test_call_local -# Check R_X86_64_PLT32 handling with a call to an external. This produces a -# Branch32ToStub edge, because externals are not defined locally. During -# resolution, the target turns out to be in-range from the callsite and so the -# edge is relaxed in post-allocation optimization. 
-# -# jitlink-check: decode_operand(test_call_extern, 0) = \ -# jitlink-check: extern_in_range32 - next_pc(test_call_extern) - .globl test_call_extern - .p2align 4, 0x90 - .type test_call_extern,@function -test_call_extern: - callq extern_in_range32@plt - - .size test_call_extern, .-test_call_extern - # Check R_X86_64_PLT32 handling with a call to an external via PLT. This # produces a Branch32ToStub edge, because externals are not defined locally. # As the target is out-of-range from the callsite, the edge keeps using its PLT @@ -85,7 +69,9 @@ .size test_call_extern_plt, .-test_call_extern_plt # Test GOTPCREL handling. We want to check both the offset to the GOT entry and its -# contents. +# contents. "movl" will be optimized to "leal" and a non-got access if the pc relative +# offset to named_data is in range of 32 bits signed immediate. So use "leal" here to +# suppress optimization # jitlink-check: decode_operand(test_gotpcrel, 4) = \ # jitlink-check: got_addr(elf_sm_pic_reloc.o, named_data) - next_pc(test_gotpcrel) # jitlink-check: *{8}(got_addr(elf_sm_pic_reloc.o, named_data)) = named_data @@ -94,7 +80,7 @@ .p2align 4, 0x90 .type test_gotpcrel,@function test_gotpcrel: - movl named_data@GOTPCREL(%rip), %eax + leal named_data@GOTPCREL(%rip), %eax .size test_gotpcrel, .-test_gotpcrel