diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp --- a/llvm/lib/CodeGen/BranchRelaxation.cpp +++ b/llvm/lib/CodeGen/BranchRelaxation.cpp @@ -83,6 +83,8 @@ // The basic block after which trampolines are inserted. This is the last // basic block that isn't in the cold section. MachineBasicBlock *TrampolineInsertionPoint = nullptr; + SmallDenseSet<std::pair<const MachineBasicBlock *, const MachineBasicBlock *>> + RelaxedUnconditionals; std::unique_ptr<RegScavenger> RS; LivePhysRegs LiveRegs; @@ -148,7 +150,8 @@ if (MI.getOpcode() == TargetOpcode::FAULTING_OP) continue; MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI); - assert(isBlockInRange(MI, *DestBB)); + assert(isBlockInRange(MI, *DestBB) || + RelaxedUnconditionals.contains({&MBB, DestBB})); } } #endif @@ -170,7 +173,9 @@ void BranchRelaxation::scanFunction() { BlockInfo.clear(); BlockInfo.resize(MF->getNumBlockIDs()); + TrampolineInsertionPoint = nullptr; + RelaxedUnconditionals.clear(); // First thing, compute the size of all basic blocks, and see if the function // has any inline assembly in it. If so, we have to be conservative about @@ -562,6 +567,8 @@ BranchBB->sortUniqueLiveIns(); BranchBB->addSuccessor(DestBB); MBB->replaceSuccessor(DestBB, BranchBB); + if (TrampolineInsertionPoint == MBB) + TrampolineInsertionPoint = BranchBB; } DebugLoc DL = MI.getDebugLoc(); @@ -585,8 +592,28 @@ BlockInfo[BranchBB->getNumber()].Size = computeBlockSize(*BranchBB); adjustBlockOffsets(*MBB); - // If RestoreBB is required, try to place just before DestBB. + // If RestoreBB is required, place it appropriately. if (!RestoreBB->empty()) { + // If the jump is Cold -> Hot, don't place the restore block (which is + // cold) in the middle of the function. Place it at the end. 
+ if (MBB->getSectionID() == MBBSectionID::ColdSectionID && + DestBB->getSectionID() != MBBSectionID::ColdSectionID) { + MachineBasicBlock *NewBB = createNewBlockAfter(*TrampolineInsertionPoint); + TII->insertUnconditionalBranch(*NewBB, DestBB, DebugLoc()); + BlockInfo[NewBB->getNumber()].Size = computeBlockSize(*NewBB); + + // New trampolines should be inserted after NewBB + TrampolineInsertionPoint = NewBB; + + // Retarget the unconditional branch to the trampoline block + BranchBB->replaceSuccessor(DestBB, NewBB); + NewBB->addSuccessor(DestBB); + + DestBB = NewBB; + } + + // In all other cases, try to place just before DestBB. + // TODO: For multiple far branches to the same destination, there are // chances that some restore blocks could be shared if they clobber the // same registers and share the same restore sequence. So far, those @@ -616,9 +643,11 @@ RestoreBB->setSectionID(DestBB->getSectionID()); RestoreBB->setIsBeginSection(DestBB->isBeginSection()); DestBB->setIsBeginSection(false); + RelaxedUnconditionals.insert({BranchBB, RestoreBB}); } else { // Remove restore block if it's not required. MF->erase(RestoreBB); + RelaxedUnconditionals.insert({BranchBB, DestBB}); } return true; @@ -644,7 +673,8 @@ // Unconditional branch destination might be unanalyzable, assume these // are OK. 
if (MachineBasicBlock *DestBB = TII->getBranchDestBlock(*Last)) { - if (!isBlockInRange(*Last, *DestBB) && !TII->isTailCall(*Last)) { + if (!isBlockInRange(*Last, *DestBB) && !TII->isTailCall(*Last) && + !RelaxedUnconditionals.contains({&MBB, DestBB})) { fixupUnconditionalBranch(*Last); ++NumUnconditionalRelaxed; Changed = true; @@ -724,6 +754,7 @@ LLVM_DEBUG(dbgs() << " Basic blocks after relaxation\n\n"; dumpBBs()); BlockInfo.clear(); + RelaxedUnconditionals.clear(); return MadeChange; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -283,30 +283,40 @@ }; RS->enterBasicBlockEnd(MBB); - Register Reg = RS->FindUnusedReg(&AArch64::GPR64RegClass); - - // If there's a free register, manually insert the indirect branch using it. - if (Reg != AArch64::NoRegister) { - buildIndirectBranch(Reg, NewDestBB); + // If X16 is unused, we can rely on the linker to insert a range extension + // thunk if NewDestBB is out of range of a single B instruction. + constexpr Register Reg = AArch64::X16; + if (!RS->isRegUsed(Reg)) { + insertUnconditionalBranch(MBB, &NewDestBB, DL); RS->setRegUsed(Reg); return; } - // Otherwise, spill and use X16. This briefly moves the stack pointer, making - // it incompatible with red zones. + // If there's a free register and it's worth inflating the code size, + // manually insert the indirect branch. + Register Scavenged = RS->FindUnusedReg(&AArch64::GPR64RegClass); + if (Scavenged != AArch64::NoRegister && + MBB.getSectionID() == MBBSectionID::ColdSectionID) { + buildIndirectBranch(Scavenged, NewDestBB); + RS->setRegUsed(Scavenged); + return; + } + + // Note: Spilling X16 briefly moves the stack pointer, making it incompatible + // with red zones. 
AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>(); if (!AFI || AFI->hasRedZone().value_or(true)) report_fatal_error( "Unable to insert indirect branch inside function that has red zone"); - Reg = AArch64::X16; + // Otherwise, spill X16 and defer range extension to the linker. BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre)) .addReg(AArch64::SP, RegState::Define) .addReg(Reg) .addReg(AArch64::SP) .addImm(-16); - buildIndirectBranch(Reg, RestoreBB); + BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB); BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost)) .addReg(AArch64::SP, RegState::Define) diff --git a/llvm/test/CodeGen/AArch64/branch-relax-b.ll b/llvm/test/CodeGen/AArch64/branch-relax-b.ll --- a/llvm/test/CodeGen/AArch64/branch-relax-b.ll +++ b/llvm/test/CodeGen/AArch64/branch-relax-b.ll @@ -6,9 +6,7 @@ ; CHECK-NEXT: tbnz w0, ; CHECK-SAME: LBB0_1 ; CHECK-NEXT: // %bb.3: // %entry -; CHECK-NEXT: adrp [[SCAVENGED_REGISTER:x[0-9]+]], .LBB0_2 -; CHECK-NEXT: add [[SCAVENGED_REGISTER]], [[SCAVENGED_REGISTER]], :lo12:.LBB0_2 -; CHECK-NEXT: br [[SCAVENGED_REGISTER]] +; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_1: // %iftrue ; CHECK-NEXT: //APP ; CHECK-NEXT: .zero 2048 @@ -44,9 +42,7 @@ ; CHECK-NEXT: // %bb.4: // %entry ; CHECK-NEXT: str [[SPILL_REGISTER:x[0-9]+]], [sp, ; CHECK-SAME: -16]! -; CHECK-NEXT: adrp [[SPILL_REGISTER]], .LBB1_5 -; CHECK-NEXT: add [[SPILL_REGISTER]], [[SPILL_REGISTER]], :lo12:.LBB1_5 -; CHECK-NEXT: br [[SPILL_REGISTER]] +; CHECK-NEXT: b .LBB1_5 ; CHECK-NEXT: .LBB1_1: // %iftrue ; CHECK-NEXT: //APP ; CHECK-NEXT: .zero 2048 @@ -137,23 +133,28 @@ define void @relax_b_x16_taken() { ; CHECK-LABEL: relax_b_x16_taken: // @relax_b_x16_taken -; COM: Pre-commit to record the behavior of relaxing an unconditional -; COM: branch across which x16 is taken. 
+; COM: Since the source of the out-of-range branch is hot and x16 is +; COM: taken, it makes sense to spill x16 and let the linker insert +; COM: fixup code for this branch rather than inflating the hot code +; COM: size by eagerly relaxing the unconditional branch. ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: //APP ; CHECK-NEXT: mov x16, #1 ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: cbnz x16, .LBB2_1 ; CHECK-NEXT: // %bb.3: // %entry -; CHECK-NEXT: adrp [[SCAVENGED_REGISTER2:x[0-9]+]], .LBB2_2 -; CHECK-NEXT: add [[SCAVENGED_REGISTER2]], [[SCAVENGED_REGISTER2]], :lo12:.LBB2_2 -; CHECK-NEXT: br [[SCAVENGED_REGISTER2]] +; CHECK-NEXT: str [[SPILL_REGISTER]], [sp, +; CHECK-SAME: -16]! +; CHECK-NEXT: b .LBB2_4 ; CHECK-NEXT: .LBB2_1: // %iftrue ; CHECK-NEXT: //APP ; CHECK-NEXT: .zero 2048 ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_2: // %iffalse +; CHECK-NEXT: .LBB2_4: // %iffalse +; CHECK-NEXT: ldr [[SPILL_REGISTER]], [sp], +; CHECK-SAME: 16 +; CHECK-NEXT: // %bb.2: // %iffalse ; CHECK-NEXT: //APP ; CHECK-NEXT: // reg use x16 ; CHECK-NEXT: //NO_APP @@ -174,4 +175,4 @@ } declare i32 @bar() -declare i32 @baz() \ No newline at end of file +declare i32 @baz() diff --git a/llvm/test/CodeGen/AArch64/branch-relax-cross-section.mir b/llvm/test/CodeGen/AArch64/branch-relax-cross-section.mir --- a/llvm/test/CodeGen/AArch64/branch-relax-cross-section.mir +++ b/llvm/test/CodeGen/AArch64/branch-relax-cross-section.mir @@ -384,12 +384,19 @@ ; INDIRECT: TBNZW ; INDIRECT-SAME: %bb.2 ; INDIRECT: [[TRAMP2]] - ; INDIRECT-NEXT: successors: %bb.3 + ; INDIRECT-NEXT: successors: %bb.6 ; INDIRECT: bb.2.end: ; INDIRECT: TCRETURNdi ; INDIRECT: [[TRAMP1]].entry: - ; INDIRECT: successors: %bb.3 - ; INDIRECT-NOT: bbsections Cold + ; INDIRECT-NEXT: successors: %[[TRAMP1_SPILL:bb.[0-9]+]] + ; INDIRECT: [[TRAMP1_SPILL]].entry: + ; INDIRECT-NEXT: successors: %[[TRAMP1_RESTORE:bb.[0-9]+]] + ; INDIRECT: early-clobber $sp = STRXpre $[[SPILL_REGISTER:x[0-9]+]], $sp, -16 + ; INDIRECT-NEXT: B 
%[[TRAMP1_RESTORE]] + ; INDIRECT: [[TRAMP1_RESTORE]].cold (bbsections Cold): + ; INDIRECT-NEXT: successors: %bb.3 + ; INDIRECT-NEXT: {{ $}} + ; INDIRECT-NEXT: early-clobber $sp, $[[SPILL_REGISTER]] = LDRXpost $sp, 16 ; INDIRECT: bb.3.cold (bbsections Cold): ; INDIRECT: TCRETURNdi @@ -433,26 +440,30 @@ hasRedZone: false body: | ; INDIRECT-LABEL: name: x16_used_cold_to_hot - ; COM: Pre-commit to record the behavior of relaxing a "cold-to-hot" - ; COM: unconditional branch across which x16 is taken but there is - ; COM: still a free register. + ; COM: Check that unconditional branches from the cold section to + ; COM: the hot section manually insert indirect branches if x16 + ; COM: isn't available but there is still a free register. ; INDIRECT: bb.0.entry: ; INDIRECT-NEXT: successors: %bb.1 ; INDIRECT-SAME: , %bb.3 ; INDIRECT: TBZW killed renamable $w8, 0, %bb.1 ; INDIRECT-NEXT: {{ $}} ; INDIRECT-NEXT: bb.3.entry: - ; INDIRECT-NEXT: successors: %bb.2 + ; INDIRECT-NEXT: successors: %bb.4 ; INDIRECT-NEXT: liveins: $x16 ; INDIRECT-NEXT: {{ $}} - ; INDIRECT-NEXT: $[[SCAVENGED_REGISTER3:x[0-9]+]] = ADRP target-flags(aarch64-page) - ; INDIRECT-NEXT: $[[SCAVENGED_REGISTER3]] = ADDXri $[[SCAVENGED_REGISTER3]], target-flags(aarch64-pageoff, aarch64-nc) , 0 - ; INDIRECT-NEXT: BR $[[SCAVENGED_REGISTER3]] + ; INDIRECT-NEXT: early-clobber $sp = STRXpre $[[SPILL_REGISTER]], $sp, -16 + ; INDIRECT-NEXT: B %bb.4 ; INDIRECT: bb.1.hot: ; INDIRECT-NEXT: liveins: $x16 ; INDIRECT: killed $x16 ; INDIRECT: RET undef $lr - ; INDIRECT: bb.2.cold (bbsections Cold): + ; INDIRECT: bb.4.cold (bbsections Cold): + ; INDIRECT-NEXT: successors: %bb.2 + ; INDIRECT-NEXT: {{ $}} + ; INDIRECT-NEXT: early-clobber $sp, $[[SPILL_REGISTER]] = LDRXpost $sp, 16 + ; INDIRECT-NEXT: {{ $}} + ; INDIRECT-NEXT: bb.2.cold (bbsections Cold): ; INDIRECT-NEXT: successors: %bb.5 ; INDIRECT-NEXT: liveins: $x16 ; INDIRECT-NEXT: {{ $}} @@ -462,9 +473,9 @@ ; INDIRECT-NEXT: successors: %bb.1 ; INDIRECT-NEXT: 
liveins: $x16 ; INDIRECT-NEXT: {{ $}} - ; INDIRECT-NEXT: $[[SCAVENGED_REGISTER4:x[0-9]+]] = ADRP target-flags(aarch64-page) - ; INDIRECT-NEXT: $[[SCAVENGED_REGISTER4]] = ADDXri $[[SCAVENGED_REGISTER4]], target-flags(aarch64-pageoff, aarch64-nc) , 0 - ; INDIRECT-NEXT: BR $[[SCAVENGED_REGISTER4]] + ; INDIRECT-NEXT: $[[SCAVENGED_REGISTER:x[0-9]+]] = ADRP target-flags(aarch64-page) + ; INDIRECT-NEXT: $[[SCAVENGED_REGISTER]] = ADDXri $[[SCAVENGED_REGISTER]], target-flags(aarch64-pageoff, aarch64-nc) , 0 + ; INDIRECT-NEXT: BR $[[SCAVENGED_REGISTER]] bb.0.entry: successors: %bb.1, %bb.2 @@ -532,8 +543,10 @@ hasRedZone: false body: | ; INDIRECT-LABEL: name: all_used_cold_to_hot - ; COM: Pre-commit to record the behavior of relaxing a "cold-to-hot" - ; COM: unconditional branch across which there are no free registers. + ; COM: Check that unconditional branches from the cold section to + ; COM: the hot section spill x16 and defer indirect branch + ; COM: insertion to the linker if there are no free general-purpose + ; COM: registers. 
; INDIRECT: bb.0.entry: ; INDIRECT-NEXT: successors: %bb.3 ; INDIRECT-NEXT: liveins: $fp, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20 @@ -545,17 +558,7 @@ ; INDIRECT-SAME: $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x17, $x18, $x19, ; INDIRECT-SAME: $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28 ; INDIRECT-NEXT: {{ $}} - ; INDIRECT-NEXT: $[[SCAVENGED_REGISTER5:x[0-9]+]] = ADRP target-flags(aarch64-page) - ; INDIRECT-NEXT: $[[SCAVENGED_REGISTER5]] = ADDXri $[[SCAVENGED_REGISTER5]], target-flags(aarch64-pageoff, aarch64-nc) , 0 - ; INDIRECT-NEXT: BR $[[SCAVENGED_REGISTER5]] - ; INDIRECT-NEXT: {{ $}} - ; INDIRECT-NEXT: bb.6.exit: - ; INDIRECT-NEXT: successors: %bb.1 - ; INDIRECT-NEXT: liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, - ; INDIRECT-SAME: $x10, $x11, $x12, $x13, $x14, $x15, $x17, $x18, $x19, $x20, - ; INDIRECT-SAME: $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28 - ; INDIRECT-NEXT: {{ $}} - ; INDIRECT-NEXT: early-clobber $sp, $[[SPILL_REGISTER:x[0-9]+]] = LDRXpost $sp, 16 + ; INDIRECT-NEXT: B %bb.2 ; INDIRECT-NEXT: {{ $}} ; INDIRECT-NEXT: bb.1.exit: ; INDIRECT-NEXT: liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, @@ -565,6 +568,16 @@ ; INDIRECT-COUNT-30: INLINEASM &"# reg use $0", 1 /* sideeffect attdialect */, 9 /* reguse */, killed ; INDIRECT: RET undef $lr ; INDIRECT-NEXT: {{ $}} + ; INDIRECT-NEXT: bb.6.exit: + ; INDIRECT-NEXT: successors: %bb.7(0x80000000) + ; INDIRECT-NEXT: {{ $}} + ; INDIRECT-NEXT: early-clobber $sp, $[[SPILL_REGISTER]] = LDRXpost $sp, 16 + ; INDIRECT-NEXT: {{ $}} + ; INDIRECT-NEXT: bb.7.exit: + ; INDIRECT-NEXT: successors: %bb.1(0x80000000) + ; INDIRECT-NEXT: {{ $}} + ; INDIRECT-NEXT: B %bb.1 + ; INDIRECT-NEXT: {{ $}} ; INDIRECT-NEXT: bb.2.cold (bbsections Cold): ; INDIRECT-NEXT: successors: %bb.5 ; INDIRECT-NEXT: liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, @@ -580,9 +593,7 @@ ; INDIRECT-SAME: $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28 ; INDIRECT-NEXT: {{ $}} ; 
INDIRECT-NEXT: early-clobber $sp = STRXpre $[[SPILL_REGISTER]], $sp, -16 - ; INDIRECT-NEXT: $[[SPILL_REGISTER]] = ADRP target-flags(aarch64-page) - ; INDIRECT-NEXT: $[[SPILL_REGISTER]] = ADDXri $[[SPILL_REGISTER]], target-flags(aarch64-pageoff, aarch64-nc) , 0 - ; INDIRECT-NEXT: BR $[[SPILL_REGISTER]] + ; INDIRECT-NEXT: B %bb.6 bb.0.entry: successors: %bb.2