diff --git a/lld/MachO/ConcatOutputSection.cpp b/lld/MachO/ConcatOutputSection.cpp --- a/lld/MachO/ConcatOutputSection.cpp +++ b/lld/MachO/ConcatOutputSection.cpp @@ -222,6 +222,11 @@ size_t thunkCallCount = 0; size_t thunkCount = 0; + // Walk all sections in order. Finalize all sections that are less than + // forwardBranchRange in front of it. + // isecVA is the address of the current section. + // isecAddr is the start address of the first non-finalized section. + // inputs[finalIdx] is for finalization (address-assignment) size_t finalIdx = 0; // Kick-off by ensuring that the first input section has an address @@ -232,12 +237,22 @@ ConcatInputSection *isec = inputs[callIdx]; assert(isec->isFinal); uint64_t isecVA = isec->getVA(); - // Assign addresses up-to the forward branch-range limit + + // Assign addresses up-to the forward branch-range limit. + // Every call instruction needs a small number of bytes (on Arm64: 4), + // and each inserted thunk needs a slighly larger number of bytes + // (on Arm64: 12). If a section starts with a branch instruction and + // contains several branch instructions in succession, then the distance + // from the current position to the position where the thunks are inserted + // grows. So leave room for a bunch of thunks. + unsigned slop = 100 * thunkSize; while (finalIdx < endIdx && isecAddr + inputs[finalIdx]->getSize() < - isecVA + forwardBranchRange - thunkSize) + isecVA + forwardBranchRange - slop) finalizeOne(inputs[finalIdx++]); + if (isec->callSiteCount == 0) continue; + if (finalIdx == endIdx && stubsInRangeVA == TargetInfo::outOfRangeVA) { // When we have finalized all input sections, __stubs (destined // to follow __text) comes within range of forward branches and @@ -293,13 +308,10 @@ } // ... otherwise, create a new thunk. if (isecAddr > highVA) { - // When there is small-to-no margin between highVA and - // isecAddr and the distance between subsequent call sites is - // smaller than thunkSize, then a new thunk can go out of - // range. Fix by unfinalizing inputs[finalIdx] to reduce the - // distance between callVA and highVA, then shift some thunks - // to occupy address-space formerly occupied by the - // unfinalized inputs[finalIdx]. + // There were too many consecutive branch instructions for `slop` + // above. If you hit this: For the current algorithm, just bumping up + // slop above and trying again is probably simplest. (See also PR51578 + // comment 5). fatal(Twine(__FUNCTION__) + ": FIXME: thunk range overrun"); } thunkInfo.isec = diff --git a/lld/test/MachO/arm64-thunk-starvation.s b/lld/test/MachO/arm64-thunk-starvation.s new file mode 100644 --- /dev/null +++ b/lld/test/MachO/arm64-thunk-starvation.s @@ -0,0 +1,57 @@ +# REQUIRES: aarch64 +# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %s -o %t.o +# RUN: %lld -arch arm64 -lSystem -o %t.out %t.o + +## Regression test for PR51578. + +.subsections_via_symbols + +.globl _f1, _f2, _f3, _f4, _f5, _f6 +.p2align 2 +_f1: b _fn1 +_f2: b _fn2 +_f3: b _fn3 +_f4: b _fn4 +_f5: b _fn5 +_f6: b _fn6 +## 6 * 4 = 24 bytes for branches +## Currently leaves 12 bytes for one thunk, so 36 bytes. +## Uses < instead of <=, so 40 bytes. + +.global _spacer1, _spacer1 +## 0x8000000 is 128 MiB, one more than the forward branch limit, +## distributed over two functions since our thunk insertion algorithm +## can't deal with a single function that's 128 MiB. +## We leave just enough room so that the old thunking algorithm finalized +## both spacers when processing _f1 (24 bytes for the 4 bytes code for each +## of the 6 _f functions, 12 bytes for one thunk, 4 bytes because the forward +## branch range is 128 Mib - 4 bytes, and another 4 bytes because the algorithm +## uses `<` instead of `<=`, for a total of 44 bytes slop.) Of the slop, 20 +## bytes are actually room for thunks. +## _fn1-_fn6 aren't finalized because then there wouldn't be room for a thunk. +## But when a thunk is inserted to jump from _f1 to _fn1, that needs 12 bytes +## but _f2 is only 4 bytes later, so after _f1 there are only +## 20-(12-4) = 12 bytes left, after _f2 only 12-(12-4) 4 bytes, and after +## _f3 there's no more room for thunks and we can't make progress. +## The fix is to leave room for many more thunks. +## The same construction as this test case can defeat that too with enough +## consecutive jumps, but in practice there aren't hundreds of consecutive +## jump instructions. + +_spacer1: +.space 0x4000000 +_spacer2: +.space 0x4000000 - 44 + +.globl _fn1, _fn2, _fn3, _fn4, _fn5, _fn6 +.p2align 2 +_fn1: ret +_fn2: ret +_fn3: ret +_fn4: ret +_fn5: ret +_fn6: ret + +.globl _main +_main: + ret