Index: lib/Target/ARM/Thumb1FrameLowering.cpp
===================================================================
--- lib/Target/ARM/Thumb1FrameLowering.cpp
+++ lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -406,11 +406,15 @@
   if (AFI->getArgRegsSaveSize())
     return true;
 
-  bool IsV4PopReturn = false;
+  // FIXME: this doesn't make sense, and the following patch will remove it.
+  if (!STI.hasV4TOps()) return false;
+
+  // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up.
   for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo())
     if (CSI.getReg() == ARM::LR)
-      IsV4PopReturn = true;
-  return IsV4PopReturn && STI.hasV4TOps() && !STI.hasV5TOps();
+      return true;
+
+  return false;
 }
 
 bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
@@ -422,12 +426,45 @@
   const ThumbRegisterInfo *RegInfo =
       static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo());
 
-  // When we need a special fix up for POP, this means that
-  // we either cannot use PC in POP or we have to update
-  // SP after poping the return address.
-  // In other words, we cannot use a pop {pc} like construction
-  // here, no matter what.
+  // If MBBI is a return instruction, or is a tPOP followed by a return
+  // instruction in the successor BB, we may be able to directly restore
+  // LR in the PC.
+  // This is only possible with v5T ops (v4T can't change the Thumb bit via
+  // a POP PC instruction), and only if we do not need to emit any SP update.
+  // Otherwise, we need a temporary register to pop the value
+  // and copy that value into LR.
   auto MBBI = MBB.getFirstTerminator();
+  bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize;
+  if (CanRestoreDirectly) {
+    if (MBBI != MBB.end())
+      CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET ||
+                            MBBI->getOpcode() == ARM::tPOP_RET);
+    else {
+      assert(MBB.back().getOpcode() == ARM::tPOP);
+      assert(MBB.succ_size() == 1);
+      if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET)
+        MBBI--; // Replace the final tPOP with a tPOP_RET.
+      else
+        CanRestoreDirectly = false;
+    }
+  }
+
+  if (CanRestoreDirectly) {
+    if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET)
+      return true;
+    MachineInstrBuilder MIB =
+        AddDefaultPred(
+            BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET)));
+    // Copy implicit ops and popped registers, if any.
+    for (const MachineOperand &MO : MBBI->operands())
+      if (MO.isReg() && (MO.isImplicit() || MO.isDef()) &&
+          MO.getReg() != ARM::LR)
+        MIB.addOperand(MO);
+    MIB.addReg(ARM::PC, RegState::Define);
+    // Erase the old instruction (tBX_RET or tPOP).
+    MBB.erase(MBBI);
+    return true;
+  }
 
   // Look for a temporary register to use.
   // First, compute the liveness information.
@@ -446,10 +483,10 @@
   if (MBBI != MBB.end()) {
     dl = MBBI->getDebugLoc();
     auto InstUpToMBBI = MBB.end();
-    // The post-decrement is on purpose here.
-    // We want to have the liveness right before MBBI.
-    while (InstUpToMBBI-- != MBBI)
-      UsedRegs.stepBackward(*InstUpToMBBI);
+    while (InstUpToMBBI != MBBI)
+      // The pre-decrement is on purpose here.
+      // We want to have the liveness right before MBBI.
+      UsedRegs.stepBackward(*--InstUpToMBBI);
   }
 
   // Look for a register that can be directly use in the POP.
@@ -495,6 +532,12 @@
                        .addReg(PopReg, RegState::Kill));
   }
 
+  if (MBBI == MBB.end()) {
+    MachineInstr &Pop = MBB.back();
+    assert(Pop.getOpcode() == ARM::tPOP);
+    Pop.RemoveOperand(Pop.findRegisterDefOperandIdx(ARM::LR));
+  }
+
   assert(PopReg && "Do not know how to get LR");
   AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
       .addReg(PopReg, RegState::Define);
Index: test/CodeGen/Thumb/pop-special-fixup.ll
===================================================================
--- /dev/null
+++ test/CodeGen/Thumb/pop-special-fixup.ll
@@ -0,0 +1,60 @@
+; RUN: llc %s -enable-shrink-wrap=true -o - | FileCheck %s
+
+target triple = "thumbv6m-none-none-eabi"
+
+@retval = global i32 0, align 4
+
+define i32 @test(i32 %i, i32 %argc, i8** nocapture readonly %argv) {
+  %1 = icmp sgt i32 %argc, %i
+  br i1 %1, label %2, label %19
+
+  %3 = getelementptr inbounds i8*, i8** %argv, i32 %i
+  %4 = load i8*, i8** %3, align 4
+  %5 = load i8, i8* %4, align 1
+  %6 = icmp eq i8 %5, 45
+  %7 = getelementptr inbounds i8, i8* %4, i32 1
+  %. = select i1 %6, i8* %7, i8* %4
+  %.1 = select i1 %6, i32 -1, i32 1
+  %8 = load i8, i8* %., align 1
+  %.off2 = add i8 %8, -48
+  %9 = icmp ult i8 %.off2, 10
+  %.pre = load i32, i32* @retval, align 4
+  br i1 %9, label %.lr.ph.preheader, label %.critedge
+
+.lr.ph.preheader: ; preds = %2
+  br label %.lr.ph
+
+.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
+  %10 = phi i32 [ %14, %.lr.ph ], [ %.pre, %.lr.ph.preheader ]
+  %11 = phi i8 [ %15, %.lr.ph ], [ %8, %.lr.ph.preheader ]
+  %valstring.03 = phi i8* [ %13, %.lr.ph ], [ %., %.lr.ph.preheader ]
+  %12 = zext i8 %11 to i32
+  %13 = getelementptr inbounds i8, i8* %valstring.03, i32 1
+  %14 = add nsw i32 %10, %12
+  store i32 %14, i32* @retval, align 4
+  %15 = load i8, i8* %13, align 1
+  %.off = add i8 %15, -48
+  %16 = icmp ult i8 %.off, 10
+  br i1 %16, label %.lr.ph, label %.critedge.loopexit
+
+.critedge.loopexit: ; preds = %.lr.ph
+  %.lcssa = phi i32 [ %14, %.lr.ph ]
+  br label %.critedge
+
+.critedge: ; preds = %.critedge.loopexit, %2
+  %17 = phi i32 [ %.pre, %2 ], [ %.lcssa, %.critedge.loopexit ]
+  %18 = mul nsw i32 %17, %.1
+  store i32 %18, i32* @retval, align 4
+  br label %19
+
+;