diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -370,29 +370,35 @@
   const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   const ARMFrameLowering *TFI = getFrameLowering(MF);
 
-  // When outgoing call frames are so large that we adjust the stack pointer
-  // around the call, we can no longer use the stack pointer to reach the
-  // emergency spill slot.
+  // If we have stack realignment and VLAs, we have no pointer to use to
+  // access the stack. If we have stack realignment, and a large call frame,
+  // we have no place to allocate the emergency spill slot.
   if (needsStackRealignment(MF) && !TFI->hasReservedCallFrame(MF))
     return true;
 
   // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited
   // negative range for ldr/str (255), and thumb1 is positive offsets only.
+  //
   // It's going to be better to use the SP or Base Pointer instead. When there
   // are variable sized objects, we can't reference off of the SP, so we
   // reserve a Base Pointer.
-  if (AFI->isThumbFunction() && MFI.hasVarSizedObjects()) {
-    // Conservatively estimate whether the negative offset from the frame
-    // pointer will be sufficient to reach. If a function has a smallish
-    // frame, it's less likely to have lots of spills and callee saved
-    // space, so it's all more likely to be within range of the frame pointer.
-    // If it's wrong, the scavenger will still enable access to work, it just
-    // won't be optimal.
-    if (AFI->isThumb2Function() && MFI.getLocalFrameSize() < 128)
-      return false;
+  //
+  // For Thumb2, estimate whether a negative offset from the frame pointer
+  // will be sufficient to reach the whole stack frame. If a function has a
+  // smallish frame, it's less likely to have lots of spills and callee saved
+  // space, so it's all more likely to be within range of the frame pointer.
+  // If it's wrong, the scavenger will still enable access to work, it just
+  // won't be optimal. (We should always be able to reach the emergency
+  // spill slot from the frame pointer.)
+  if (AFI->isThumb2Function() && MFI.hasVarSizedObjects() &&
+      MFI.getLocalFrameSize() >= 128)
+    return true;
+  // For Thumb1, if sp moves, nothing is in range, so force a base pointer.
+  // This is necessary for correctness in cases where we need an emergency
+  // spill slot. (In Thumb1, we can't use a negative offset from the frame
+  // pointer.)
+  if (AFI->isThumb1OnlyFunction() && !TFI->hasReservedCallFrame(MF))
     return true;
-  }
-
   return false;
 }
 
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -344,6 +344,10 @@
 /// as assignCalleeSavedSpillSlots() hasn't run at this point. Instead we use
 /// this to produce a conservative estimate that we check in an assert() later.
 static int getMaxFPOffset(const Function &F, const ARMFunctionInfo &AFI) {
+  // For Thumb1, push.w isn't available, so the first push will always push
+  // r7 and lr onto the stack first.
+  if (AFI.isThumb1OnlyFunction())
+    return -AFI.getArgRegsSaveSize() - (2 * 4);
   // This is a conservative estimation: Assume the frame pointer being r7 and
   // pc("r15") up to r8 getting spilled before (= 8 registers).
   return -AFI.getArgRegsSaveSize() - (8 * 4);
@@ -954,8 +958,12 @@
     }
   }
   // Use the base pointer if we have one.
-  if (RegInfo->hasBasePointer(MF))
+  // FIXME: Maybe prefer sp on Thumb1 if it's legal and the offset is cheaper?
+  // That can happen if we forced a base pointer for a large call frame.
+  if (RegInfo->hasBasePointer(MF)) {
     FrameReg = RegInfo->getBaseRegister();
+    Offset -= SPAdj;
+  }
   return Offset;
 }
 
@@ -1775,13 +1783,59 @@
   }
 
   EstimatedStackSize += 16; // For possible paddings.
 
-  unsigned EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this);
+  unsigned EstimatedRSStackSizeLimit, EstimatedRSFixedSizeLimit;
+  if (AFI->isThumb1OnlyFunction()) {
+    // For Thumb1, don't bother to iterate over the function. The only
+    // instruction that requires an emergency spill slot is a store to a
+    // frame index.
+    //
+    // tSTRspi, which is used for sp-relative accesses, has an 8-bit unsigned
+    // immediate. tSTRi, which is used for bp- and fp-relative accesses, has
+    // a 5-bit unsigned immediate.
+    //
+    // We could try to check if the function actually contains a tSTRspi
+    // that might need the spill slot, but it's not really important.
+    // Functions with VLAs or extremely large call frames are rare, and
+    // if a function is allocating more than 1KB of stack, an extra 4-byte
+    // slot probably isn't relevant.
+    if (RegInfo->hasBasePointer(MF))
+      EstimatedRSStackSizeLimit = (1U << 5) * 4;
+    else
+      EstimatedRSStackSizeLimit = (1U << 8) * 4;
+    EstimatedRSFixedSizeLimit = (1U << 5) * 4;
+  } else {
+    EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this);
+    EstimatedRSFixedSizeLimit = EstimatedRSStackSizeLimit;
+  }
+  // Final estimate of whether sp or bp-relative accesses might require
+  // scavenging.
+  bool HasLargeStack = EstimatedStackSize > EstimatedRSStackSizeLimit;
+
+  // If the stack pointer moves and we don't have a base pointer, the
+  // estimate logic doesn't work. The actual offsets might be larger when
+  // we're constructing a call frame, or we might need to use negative
+  // offsets from fp.
+  bool HasMovingSP = MFI.hasVarSizedObjects() ||
+                     (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF));
+  bool HasBPOrFixedSP = RegInfo->hasBasePointer(MF) || !HasMovingSP;
+
+  // If we have a frame pointer, we assume arguments will be accessed
+  // relative to the frame pointer. Check whether fp-relative accesses to
+  // arguments require scavenging.
+  //
+  // We could do slightly better on Thumb1; in some cases, an sp-relative
+  // offset would be legal even though an fp-relative offset is not.
   int MaxFPOffset = getMaxFPOffset(MF.getFunction(), *AFI);
-  bool BigFrameOffsets = EstimatedStackSize >= EstimatedRSStackSizeLimit ||
-                         MFI.hasVarSizedObjects() ||
-                         (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF)) ||
-                         // For large argument stacks fp relative addressed may overflow.
-                         (HasFP && (MaxFixedOffset - MaxFPOffset) >= (int)EstimatedRSStackSizeLimit);
+  bool HasLargeArgumentList =
+      HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit;
+
+  bool BigFrameOffsets = HasLargeStack || !HasBPOrFixedSP ||
+                         HasLargeArgumentList;
+  LLVM_DEBUG(dbgs() << "EstimatedLimit: " << EstimatedRSStackSizeLimit
+                    << "; EstimatedStack: " << EstimatedStackSize
+                    << "; EstimatedFPStack: " << MaxFixedOffset - MaxFPOffset
+                    << "; BigFrameOffsets: " << BigFrameOffsets
+                    << "\n");
   if (BigFrameOffsets ||
       !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
     AFI->setHasStackFrame(true);
@@ -1806,8 +1860,17 @@
       CS1Spilled = true;
     }
 
-  // This is true when we inserted a spill for an unused register that can now
-  // be used for register scavenging.
+  // This is true when we inserted a spill for a callee-save GPR which is
+  // not otherwise used by the function. This guarantees it is possible
+  // to scavenge a register to hold the address of a stack slot. On Thumb1,
+  // the register must be a valid operand to tSTRi, i.e. r4-r7. For other
+  // subtargets, this is any GPR, i.e. r4-r11 or lr.
+  //
+  // If we don't insert a spill, we instead allocate an emergency spill
+  // slot, which can be used by scavenging to spill an arbitrary register.
+  //
+  // We currently don't try to figure out whether any specific instruction
+  // requires scavenging an additional register.
   bool ExtraCSSpill = false;
 
   if (AFI->isThumb1OnlyFunction()) {
@@ -1916,7 +1979,7 @@
           NumGPRSpills++;
           CS1Spilled = true;
           assert(!MRI.isReserved(Reg) && "Should not be reserved");
-          if (!MRI.isPhysRegUsed(Reg))
+          if (Reg != ARM::LR && !MRI.isPhysRegUsed(Reg))
             ExtraCSSpill = true;
           UnspilledCS1GPRs.erase(llvm::find(UnspilledCS1GPRs, Reg));
           if (Reg == ARM::LR)
@@ -1941,7 +2004,8 @@
       UnspilledCS1GPRs.erase(LRPos);
 
     ForceLRSpill = false;
-    if (!MRI.isReserved(ARM::LR) && !MRI.isPhysRegUsed(ARM::LR))
+    if (!MRI.isReserved(ARM::LR) && !MRI.isPhysRegUsed(ARM::LR) &&
+        !AFI->isThumb1OnlyFunction())
       ExtraCSSpill = true;
   }
 
@@ -1963,7 +2027,8 @@
             SavedRegs.set(Reg);
             LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, TRI)
                               << " to make up alignment\n");
-            if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg))
+            if (!MRI.isReserved(Reg) && !MRI.isPhysRegUsed(Reg) &&
+                !(Reg == ARM::LR && AFI->isThumb1OnlyFunction()))
               ExtraCSSpill = true;
             break;
           }
@@ -1992,8 +2057,7 @@
           unsigned Reg = UnspilledCS1GPRs.back();
           UnspilledCS1GPRs.pop_back();
           if (!MRI.isReserved(Reg) &&
-              (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
-               Reg == ARM::LR)) {
+              (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg))) {
             Extras.push_back(Reg);
             NumExtras--;
           }
@@ -2016,10 +2080,10 @@
         ExtraCSSpill = true;
       }
     }
-    if (!ExtraCSSpill && !AFI->isThumb1OnlyFunction()) {
-      // note: Thumb1 functions spill to R12, not the stack.  Reserve a slot
-      // closest to SP or frame pointer.
+    if (!ExtraCSSpill) {
+      // Reserve a slot closest to SP or frame pointer.
       assert(RS && "Register scavenging not provided");
+      LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n");
       const TargetRegisterClass &RC = ARM::GPRRegClass;
       unsigned Size = TRI->getSpillSize(RC);
       unsigned Align = TRI->getSpillAlignment(RC);
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1150,15 +1150,22 @@
     if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) {
       Base = N.getOperand(0);
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-      // For LHS+RHS to result in an offset that's a multiple of 4 the object
-      // indexed by the LHS must be 4-byte aligned.
+      // Make sure the offset is inside the object, or we might fail to
+      // allocate an emergency spill slot. (An out-of-range access is UB, but
+      // it could show up anyway.)
       MachineFrameInfo &MFI = MF->getFrameInfo();
-      if (MFI.getObjectAlignment(FI) < 4)
-        MFI.setObjectAlignment(FI, 4);
-      Base = CurDAG->getTargetFrameIndex(
-          FI, TLI->getPointerTy(CurDAG->getDataLayout()));
-      OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
-      return true;
+      if (RHSC * 4 < MFI.getObjectSize(FI)) {
+        // For LHS+RHS to result in an offset that's a multiple of 4 the object
+        // indexed by the LHS must be 4-byte aligned.
+        if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlignment(FI) < 4)
+          MFI.setObjectAlignment(FI, 4);
+        if (MFI.getObjectAlignment(FI) >= 4) {
+          Base = CurDAG->getTargetFrameIndex(
+              FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+          OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
+          return true;
+        }
+      }
     }
   }
 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -3652,7 +3652,8 @@
     // argument passed via stack.
     int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
                                     CCInfo.getInRegsParamsCount(),
-                                    CCInfo.getNextStackOffset(), 4);
+                                    CCInfo.getNextStackOffset(),
+                                    std::max(4U, TotalArgRegsSaveSize));
     AFI->setVarArgsFrameIndex(FrameIndex);
   }
 
diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -63,15 +63,52 @@
   return !MFI.hasVarSizedObjects();
 }
 
-static void emitSPUpdate(MachineBasicBlock &MBB,
-                         MachineBasicBlock::iterator &MBBI,
-                         const TargetInstrInfo &TII, const DebugLoc &dl,
-                         const ThumbRegisterInfo &MRI, int NumBytes,
-                         unsigned MIFlags = MachineInstr::NoFlags) {
+static void
+emitPrologueEpilogueSPUpdate(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI,
+                             const TargetInstrInfo &TII, const DebugLoc &dl,
+                             const ThumbRegisterInfo &MRI, int NumBytes,
+                             unsigned ScratchReg, unsigned MIFlags) {
+  // If it would take more than three instructions to adjust the stack pointer
+  // using tADDspi/tSUBspi, load an immediate instead.
+  if (std::abs(NumBytes) > 508 * 3) {
+    // We use a different codepath here from the normal
+    // emitThumbRegPlusImmediate so we don't have to deal with register
+    // scavenging. (Scavenging could try to use the emergency spill slot
+    // before we've actually finished setting up the stack.)
+    if (ScratchReg == ARM::NoRegister)
+      report_fatal_error("Failed to emit Thumb1 stack adjustment");
+    MachineFunction &MF = *MBB.getParent();
+    const ARMSubtarget &ST = MF.getSubtarget<ARMSubtarget>();
+    if (ST.genExecuteOnly()) {
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ScratchReg)
+          .addImm(NumBytes).setMIFlags(MIFlags);
+    } else {
+      MRI.emitLoadConstPool(MBB, MBBI, dl, ScratchReg, 0, NumBytes, ARMCC::AL,
+                            0, MIFlags);
+    }
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDhirr), ARM::SP)
+        .addReg(ARM::SP).addReg(ScratchReg, RegState::Kill)
+        .add(predOps(ARMCC::AL));
+    return;
+  }
+  // FIXME: This is assuming the heuristics in emitThumbRegPlusImmediate
+  // won't change.
   emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII,
                             MRI, MIFlags);
+}
 
+static void emitCallSPUpdate(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI,
+                             const TargetInstrInfo &TII, const DebugLoc &dl,
+                             const ThumbRegisterInfo &MRI, int NumBytes,
+                             unsigned MIFlags = MachineInstr::NoFlags) {
+  emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII,
+                            MRI, MIFlags);
 }
 
 MachineBasicBlock::iterator Thumb1FrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
@@ -95,10 +132,10 @@
       // Replace the pseudo instruction with a new instruction...
       unsigned Opc = Old.getOpcode();
       if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
-        emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
+        emitCallSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
       } else {
         assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
-        emitSPUpdate(MBB, I, TII, dl, *RegInfo, Amount);
+        emitCallSPUpdate(MBB, I, TII, dl, *RegInfo, Amount);
       }
     }
   }
@@ -141,8 +178,8 @@
   int FramePtrSpillFI = 0;
 
   if (ArgRegsSaveSize) {
-    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
-                 MachineInstr::FrameSetup);
+    emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
+                                 ARM::NoRegister, MachineInstr::FrameSetup);
     CFAOffset -= ArgRegsSaveSize;
     unsigned CFIIndex = MF.addFrameInst(
         MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
@@ -153,8 +190,9 @@
 
   if (!AFI->hasStackFrame()) {
     if (NumBytes - ArgRegsSaveSize != 0) {
-      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -(NumBytes - ArgRegsSaveSize),
-                   MachineInstr::FrameSetup);
+      emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo,
+                                   -(NumBytes - ArgRegsSaveSize),
+                                   ARM::NoRegister, MachineInstr::FrameSetup);
       CFAOffset -= NumBytes - ArgRegsSaveSize;
       unsigned CFIIndex = MF.addFrameInst(
           MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
@@ -331,8 +369,20 @@
 
   if (NumBytes) {
     // Insert it after all the callee-save spills.
-    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
-                 MachineInstr::FrameSetup);
+    //
+    // For a large stack frame, we might need a scratch register to store
+    // the size of the frame. We know all callee-save registers are free
+    // at this point in the prologue, so pick one.
+    unsigned ScratchRegister = ARM::NoRegister;
+    for (auto &I : CSI) {
+      unsigned Reg = I.getReg();
+      if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
+        ScratchRegister = Reg;
+        break;
+      }
+    }
+    emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+                                 ScratchRegister, MachineInstr::FrameSetup);
     if (!HasFP) {
       CFAOffset -= NumBytes;
       unsigned CFIIndex = MF.addFrameInst(
@@ -437,7 +487,9 @@
 
   if (!AFI->hasStackFrame()) {
     if (NumBytes - ArgRegsSaveSize != 0)
-      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes - ArgRegsSaveSize);
+      emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo,
+                                   NumBytes - ArgRegsSaveSize, ARM::NoRegister,
+                                   MachineInstr::NoFlags);
   } else {
     // Unwind MBBI to point to first LDR / VLDRD.
     if (MBBI != MBB.begin()) {
@@ -472,13 +524,27 @@
           .addReg(FramePtr)
           .add(predOps(ARMCC::AL));
     } else {
+      // For a large stack frame, we might need a scratch register to store
+      // the size of the frame. We know all callee-save registers are free
+      // at this point in the epilogue, so pick one.
+      unsigned ScratchRegister = ARM::NoRegister;
+      bool HasFP = hasFP(MF);
+      for (auto &I : MFI.getCalleeSavedInfo()) {
+        unsigned Reg = I.getReg();
+        if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
+          ScratchRegister = Reg;
+          break;
+        }
+      }
       if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET &&
           &MBB.front() != &*MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
         MachineBasicBlock::iterator PMBBI = std::prev(MBBI);
         if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*PMBBI, NumBytes))
-          emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
+          emitPrologueEpilogueSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes,
+                                       ScratchRegister, MachineInstr::NoFlags);
       } else if (!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
-        emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
+        emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes,
+                                     ScratchRegister, MachineInstr::NoFlags);
     }
   }
 
@@ -665,7 +731,9 @@
   // Advance past the pop instruction.
   MBBI++;
   // Increment the SP.
-  emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize + 4);
+  emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo,
+                               ArgRegsSaveSize + 4, ARM::NoRegister,
+                               MachineInstr::NoFlags);
   return true;
 }
 
@@ -706,7 +774,8 @@
       .add(predOps(ARMCC::AL))
       .addReg(PopReg, RegState::Define);
 
-  emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
+  emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize,
+                               ARM::NoRegister, MachineInstr::NoFlags);
 
   BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
       .addReg(ARM::LR, RegState::Define)
diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.h b/llvm/lib/Target/ARM/ThumbRegisterInfo.h
--- a/llvm/lib/Target/ARM/ThumbRegisterInfo.h
+++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.h
@@ -51,14 +51,10 @@
                                const ARMBaseInstrInfo &TII) const;
   void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
                          int64_t Offset) const override;
-  bool saveScavengerRegister(MachineBasicBlock &MBB,
-                             MachineBasicBlock::iterator I,
-                             MachineBasicBlock::iterator &UseMI,
-                             const TargetRegisterClass *RC,
-                             unsigned Reg) const override;
   void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
                            unsigned FIOperandNum,
                            RegScavenger *RS = nullptr) const override;
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
 };
 }
 
diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
--- a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -446,63 +446,6 @@
   (void)Done;
 }
 
-/// saveScavengerRegister - Spill the register so it can be used by the
-/// register scavenger. Return true.
-bool ThumbRegisterInfo::saveScavengerRegister(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-    MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC,
-    unsigned Reg) const {
-
-  const ARMSubtarget &STI = MBB.getParent()->getSubtarget<ARMSubtarget>();
-  if (!STI.isThumb1Only())
-    return ARMBaseRegisterInfo::saveScavengerRegister(MBB, I, UseMI, RC, Reg);
-
-  // Thumb1 can't use the emergency spill slot on the stack because
-  // ldr/str immediate offsets must be positive, and if we're referencing
-  // off the frame pointer (if, for example, there are alloca() calls in
-  // the function, the offset will be negative. Use R12 instead since that's
-  // a call clobbered register that we know won't be used in Thumb1 mode.
-  const TargetInstrInfo &TII = *STI.getInstrInfo();
-  DebugLoc DL;
-  BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
-      .addReg(ARM::R12, RegState::Define)
-      .addReg(Reg, RegState::Kill)
-      .add(predOps(ARMCC::AL));
-
-  // The UseMI is where we would like to restore the register. If there's
-  // interference with R12 before then, however, we'll need to restore it
-  // before that instead and adjust the UseMI.
-  bool done = false;
-  for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
-    if (II->isDebugInstr())
-      continue;
-    // If this instruction affects R12, adjust our restore point.
-    for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
-      const MachineOperand &MO = II->getOperand(i);
-      if (MO.isRegMask() && MO.clobbersPhysReg(ARM::R12)) {
-        UseMI = II;
-        done = true;
-        break;
-      }
-      if (!MO.isReg() || MO.isUndef() || !MO.getReg() ||
-          TargetRegisterInfo::isVirtualRegister(MO.getReg()))
-        continue;
-      if (MO.getReg() == ARM::R12) {
-        UseMI = II;
-        done = true;
-        break;
-      }
-    }
-  }
-  // Restore the register from R12
-  BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr))
-      .addReg(Reg, RegState::Define)
-      .addReg(ARM::R12, RegState::Kill)
-      .add(predOps(ARMCC::AL));
-
-  return true;
-}
-
 void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                             int SPAdj, unsigned FIOperandNum,
                                             RegScavenger *RS) const {
@@ -618,3 +561,14 @@
   if (MI.isPredicable())
     MIB.add(predOps(ARMCC::AL));
 }
+
+bool
+ThumbRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  if (MF.getSubtarget<ARMSubtarget>().isThumb1Only()) {
+    // For Thumb1, the emergency spill slot must be some small positive
+    // offset from the base/stack pointer.
+    return false;
+  }
+  // For Thumb2, put the emergency spill slot next to FP.
+  return true;
+}
diff --git a/llvm/test/CodeGen/ARM/ldrex-frame-size.ll b/llvm/test/CodeGen/ARM/ldrex-frame-size.ll
--- a/llvm/test/CodeGen/ARM/ldrex-frame-size.ll
+++ b/llvm/test/CodeGen/ARM/ldrex-frame-size.ll
@@ -11,9 +11,9 @@
 define void @test_large_frame() {
 ; CHECK-LABEL: test_large_frame:
 ; CHECK: push
-; CHECK: sub.w sp, sp, #1004
+; CHECK: sub.w sp, sp, #1008
 
-  %ptr = alloca i32, i32 251
+  %ptr = alloca i32, i32 252
   %addr = getelementptr i32, i32* %ptr, i32 1
   call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
@@ -24,9 +24,9 @@
 define void @test_small_frame() {
 ; CHECK-LABEL: test_small_frame:
 ; CHECK-NOT: push
-; CHECK: sub.w sp, sp, #1000
+; CHECK: sub.w sp, sp, #1004
 
-  %ptr = alloca i32, i32 250
+  %ptr = alloca i32, i32 251
   %addr = getelementptr i32, i32* %ptr, i32 1
   call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
diff --git a/llvm/test/CodeGen/ARM/scavenging.mir b/llvm/test/CodeGen/ARM/scavenging.mir
deleted file mode 100644
--- a/llvm/test/CodeGen/ARM/scavenging.mir
+++ /dev/null
@@ -1,66 +0,0 @@
-# RUN: llc -o - %s -mtriple=thumb-arm-none-eabi -mcpu=cortex-m0 -run-pass scavenger-test | FileCheck %s
----
-# CHECK-LABEL: name: scavengebug0
-# Make sure we are not spilling/using a physreg used in the very last
-# instruction of the scavenging range.
-# CHECK-NOT: tSTRi {{.*}}$r0,{{.*}}$r0
-# CHECK-NOT: tSTRi {{.*}}$r1,{{.*}}$r1
-# CHECK-NOT: tSTRi {{.*}}$r2,{{.*}}$r2
-# CHECK-NOT: tSTRi {{.*}}$r3,{{.*}}$r3
-# CHECK-NOT: tSTRi {{.*}}$r4,{{.*}}$r4
-# CHECK-NOT: tSTRi {{.*}}$r5,{{.*}}$r5
-# CHECK-NOT: tSTRi {{.*}}$r6,{{.*}}$r6
-# CHECK-NOT: tSTRi {{.*}}$r7,{{.*}}$r7
-name: scavengebug0
-body: |
-  bb.0:
-    ; Bring up register pressure to force emergency spilling
-    $r0 = IMPLICIT_DEF
-    $r1 = IMPLICIT_DEF
-    $r2 = IMPLICIT_DEF
-    $r3 = IMPLICIT_DEF
-    $r4 = IMPLICIT_DEF
-    $r5 = IMPLICIT_DEF
-    $r6 = IMPLICIT_DEF
-    $r7 = IMPLICIT_DEF
-
-    %0 : tgpr = IMPLICIT_DEF
-    %0 = tADDhirr %0, $sp, 14, $noreg
-    tSTRi $r0, %0, 0, 14, $noreg
-
-    %1 : tgpr = IMPLICIT_DEF
-    %1 = tADDhirr %1, $sp, 14, $noreg
-    tSTRi $r1, %1, 0, 14, $noreg
-
-    %2 : tgpr = IMPLICIT_DEF
-    %2 = tADDhirr %2, $sp, 14, $noreg
-    tSTRi $r2, %2, 0, 14, $noreg
-
-    %3 : tgpr = IMPLICIT_DEF
-    %3 = tADDhirr %3, $sp, 14, $noreg
-    tSTRi $r3, %3, 0, 14, $noreg
-
-    %4 : tgpr = IMPLICIT_DEF
-    %4 = tADDhirr %4, $sp, 14, $noreg
-    tSTRi $r4, %4, 0, 14, $noreg
-
-    %5 : tgpr = IMPLICIT_DEF
-    %5 = tADDhirr %5, $sp, 14, $noreg
-    tSTRi $r5, %5, 0, 14, $noreg
-
-    %6 : tgpr = IMPLICIT_DEF
-    %6 = tADDhirr %6, $sp, 14, $noreg
-    tSTRi $r6, %6, 0, 14, $noreg
-
-    %7 : tgpr = IMPLICIT_DEF
-    %7 = tADDhirr %7, $sp, 14, $noreg
-    tSTRi $r7, %7, 0, 14, $noreg
-
-    KILL $r0
-    KILL $r1
-    KILL $r2
-    KILL $r3
-    KILL $r4
-    KILL $r5
-    KILL $r6
-    KILL $r7
diff --git a/llvm/test/CodeGen/ARM/thumb1-varalloc.ll b/llvm/test/CodeGen/ARM/thumb1-varalloc.ll
--- a/llvm/test/CodeGen/ARM/thumb1-varalloc.ll
+++ b/llvm/test/CodeGen/ARM/thumb1-varalloc.ll
@@ -34,9 +34,10 @@
 bb3:
   %.0 = phi i8* [ %0, %entry ], [ %6, %bb2 ], [ %3, %bb1 ]
 
-; CHECK: subs r4, #5
+; CHECK: subs r4, r7, #7
+; CHECK-NEXT: subs r4, #1
 ; CHECK-NEXT: mov sp, r4
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop {r4, r6, r7, pc}
   ret i8* %.0
 }
 
diff --git a/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll b/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll
@@ -0,0 +1,380 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+target triple = "thumbv6m-unknown-unknown-eabi"
+
+define void @vla_emergency_spill(i32 %n) {
+; CHECK-LABEL: vla_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #12
+; CHECK-NEXT:    add r7, sp, #12
+; CHECK-NEXT:    .pad #4100
+; CHECK-NEXT:    ldr r6, .LCPI0_0
+; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    mov r6, sp
+; CHECK-NEXT:    adds r0, r0, #7
+; CHECK-NEXT:    movs r1, #7
+; CHECK-NEXT:    bics r0, r1
+; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    subs r0, r1, r0
+; CHECK-NEXT:    mov sp, r0
+; CHECK-NEXT:    adds r1, r6, #4
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r0, [r6]
+; CHECK-NEXT:    ldr r0, .LCPI0_1
+; CHECK-NEXT:    str r5, [r0, r6]
+; CHECK-NEXT:    ldr r0, [r6]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    subs r4, r7, #7
+; CHECK-NEXT:    subs r4, #5
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI0_0:
+; CHECK-NEXT:    .long 4294963196 @ 0xffffeffc
+; CHECK-NEXT:  .LCPI0_1:
+; CHECK-NEXT:    .long 1024 @ 0x400
+entry:
+  %x = alloca [1024 x i32], align 4
+  %vla = alloca i8, i32 %n, align 1
+  %asm1 = call { i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},0,1,2,3,4,5"(i8* %vla, [1024 x i32]* %x, i32 undef, i32 undef, i32 undef, i32 undef)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* %x, i32 0, i32 255
+  store i32 %asmresult5, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5}"(i32 %asmresult, i32 %asmresult1, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5) #2
+  ret void
+}
+
+define void @simple_emergency_spill(i32 %n) {
+; CHECK-LABEL: simple_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #8196
+; CHECK-NEXT:    ldr r7, .LCPI1_0
+; CHECK-NEXT:    add sp, r7
+; CHECK-NEXT:    add r0, sp, #4
+; CHECK-NEXT:    ldr r1, .LCPI1_2
+; CHECK-NEXT:    add r1, sp
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    ldr r0, .LCPI1_3
+; CHECK-NEXT:    add r0, sp
+; CHECK-NEXT:    str r5, [r0]
+; CHECK-NEXT:    ldr r0, [sp]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    ldr r7, .LCPI1_1
+; CHECK-NEXT:    add sp, r7
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI1_0:
+; CHECK-NEXT:    .long 4294959100 @ 0xffffdffc
+; CHECK-NEXT:  .LCPI1_1:
+; CHECK-NEXT:    .long 8196 @ 0x2004
+; CHECK-NEXT:  .LCPI1_2:
+; CHECK-NEXT:    .long 4100 @ 0x1004
+; CHECK-NEXT:  .LCPI1_3:
+; CHECK-NEXT:    .long 5120 @ 0x1400
+entry:
+  %x = alloca [1024 x i32], align 4
+  %y = alloca [1024 x i32], align 4
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},={r7},0,1,2,3,4,5,6,7"([1024 x i32]* %y, [1024 x i32]* %x, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 7
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* %x, i32 0, i32 255
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6},{r7}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7, i32 %asmresult8)
+  ret void
+}
+
+; We have some logic to try to spill registers instead of allocating an
+; emergency spill slot, but for targets where the stack alignment is 8,
+; it only triggers when there are two available registers. (This is
+; maybe worth looking into, to improve the generated code quality.)
+;
+; The scavenger itself only cares whether a register is allocatable, not
+; whether it was actually spilled in the prologue, and r7 is first on
+; the priority list, so we use it anyway. This is likely to confuse
+; debuggers, so maybe worth changing at some point.
+define void @simple_emergency_spill_nor7(i32 %n) {
+; CHECK-LABEL: simple_emergency_spill_nor7:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    .pad #8196
+; CHECK-NEXT:    ldr r6, .LCPI2_0
+; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    add r0, sp, #4
+; CHECK-NEXT:    ldr r1, .LCPI2_2
+; CHECK-NEXT:    add r1, sp
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r7, [sp]
+; CHECK-NEXT:    ldr r7, .LCPI2_3
+; CHECK-NEXT:    add r7, sp
+; CHECK-NEXT:    str r5, [r7]
+; CHECK-NEXT:    ldr r7, [sp]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    ldr r6, .LCPI2_1
+; CHECK-NEXT:    add sp, r6
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI2_0:
+; CHECK-NEXT:    .long 4294959100 @ 0xffffdffc
+; CHECK-NEXT:  .LCPI2_1:
+; CHECK-NEXT:    .long 8196 @ 0x2004
+; CHECK-NEXT:  .LCPI2_2:
+; CHECK-NEXT:    .long 4100 @ 0x1004
+; CHECK-NEXT:  .LCPI2_3:
+; CHECK-NEXT:    .long 5120 @ 0x1400
+entry:
+  %x = alloca [1024 x i32], align 4
+  %y = alloca [1024 x i32], align 4
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},0,1,2,3,4,5,6"([1024 x i32]* %y, [1024 x i32]* %x, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* %x, i32 0, i32 255
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7)
+  ret void
+}
+
+define void @arg_emergency_spill(i32 %n, i32 %n2, i32 %n3, i32 %n4, [252 x i32]* byval %p) {
+; CHECK-LABEL: arg_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    add r0, sp, #24
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r0, [sp]
+; CHECK-NEXT:    ldr r0, .LCPI3_0
+; CHECK-NEXT:    add r0, sp
+; CHECK-NEXT:    str r5, [r0]
+; CHECK-NEXT:    ldr r0, [sp]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI3_0:
+; CHECK-NEXT:    .long 1028 @ 0x404
+entry:
+  %pp = getelementptr inbounds [252 x i32], [252 x i32]* %p, i32 0, i32 0
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},={r7},0,1,2,3,4,5,6,7"(i32* %pp, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 7
+  %arrayidx = getelementptr inbounds i32, i32* %pp, i32 251
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6},{r7}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7, i32 %asmresult8)
+  ret void
+}
+
+; We currently overestimate the amount of required stack space by 16 bytes,
+; so this is the largest stack that doesn't require an emergency spill slot.
+define void @arg_no_emergency_spill(i32 %n, i32 %n2, i32 %n3, i32 %n4, [248 x i32]* byval %p) {
+; CHECK-LABEL: arg_no_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    add r0, sp, #20
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r5, [sp, #1008]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+entry:
+  %pp = getelementptr inbounds [248 x i32], [248 x i32]* %p, i32 0, i32 0
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},={r7},0,1,2,3,4,5,6,7"(i32* %pp, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef)
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 7
+  %arrayidx = getelementptr inbounds i32, i32* %pp, i32 247
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6},{r7}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7, i32 %asmresult8)
+  ret void
+}
+
+define void @aligned_emergency_spill(i32 %n, i32 %n2, i32 %n3, i32 %n4, [31 x i32]* byval %p) {
+; CHECK-LABEL: aligned_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #12
+; CHECK-NEXT:    add r7, sp, #12
+; CHECK-NEXT:    .pad #44
+; CHECK-NEXT:    sub sp, #44
+; CHECK-NEXT:    mov r4, sp
+; CHECK-NEXT:    lsrs r4, r4, #4
+; CHECK-NEXT:    lsls r4, r4, #4
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    adds r1, r7, #7
+; CHECK-NEXT:    adds r1, #1
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r0, [sp, #12]
+; CHECK-NEXT:    ldr r0, .LCPI5_0
+; CHECK-NEXT:    str r5, [r0, r7]
+; CHECK-NEXT:    ldr r0, [sp, #12]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    subs r4, r7, #7
+; CHECK-NEXT:    subs r4, #5
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  @ %bb.1:
+; CHECK-NEXT:  .LCPI5_0:
+; CHECK-NEXT:    .long 128 @ 0x80
+entry:
+  %y = alloca [4 x i32], align 16
+  %pp = getelementptr inbounds [31 x i32], [31 x i32]* %p, i32 0, i32 0
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},0,1,2,3,4,5,6"([4 x i32]* %y, i32* %pp, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef) #3
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %arrayidx = getelementptr inbounds i32, i32* %pp, i32 30
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7)
+  ret void
+}
+
+; This function should have no emergency spill slot, so its stack should be
+; smaller than @aligned_emergency_spill.
+define void @aligned_no_emergency_spill(i32 %n, i32 %n2, i32 %n3, i32 %n4, [30 x i32]* byval %p) {
+; CHECK-LABEL: aligned_no_emergency_spill:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #12
+; CHECK-NEXT:    add r7, sp, #12
+; CHECK-NEXT:    .pad #28
+; CHECK-NEXT:    sub sp, #28
+; CHECK-NEXT:    mov r4, sp
+; CHECK-NEXT:    lsrs r4, r4, #4
+; CHECK-NEXT:    lsls r4, r4, #4
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    adds r1, r7, #7
+; CHECK-NEXT:    adds r1, #1
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r5, [r7, #124]
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    subs r4, r7, #7
+; CHECK-NEXT:    subs r4, #5
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+entry:
+  %y = alloca [4 x i32], align 16
+  %pp = getelementptr inbounds [30 x i32], [30 x i32]* %p, i32 0, i32 0
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},0,1,2,3,4,5,6"([4 x i32]* %y, i32* %pp, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef) #3
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %arrayidx = getelementptr inbounds i32, i32* %pp, i32 29
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7)
+  ret void
+}
+
+; This function shouldn't fail to compile. (It's UB, so it doesn't really
+; matter what it compiles to, exactly, but we need to check at some point
+; so we don't generate code that requires an emergency spill slot we never
+; allocated. If the store gets eliminated, this testcase probably needs
+; to be rewritten.)
+define void @aligned_out_of_range_access(i32 %n, i32 %n2, i32 %n3, i32 %n4, [30 x i32]* byval %p) {
+; CHECK-LABEL: aligned_out_of_range_access:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .setfp r7, sp, #12
+; CHECK-NEXT:    add r7, sp, #12
+; CHECK-NEXT:    .pad #44
+; CHECK-NEXT:    sub sp, #44
+; CHECK-NEXT:    mov r4, sp
+; CHECK-NEXT:    lsrs r4, r4, #4
+; CHECK-NEXT:    lsls r4, r4, #4
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    adds r1, r7, #7
+; CHECK-NEXT:    adds r1, #1
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    str r5, [r0, #120]
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    @APP
+; CHECK-NEXT:    @NO_APP
+; CHECK-NEXT:    subs r4, r7, #7
+; CHECK-NEXT:    subs r4, #5
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+entry:
+  %y = alloca [4 x i32], align 16
+  %pp = getelementptr inbounds [30 x i32], [30 x i32]* %p, i32 0, i32 0
+  %asm1 = call { i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},0,1,2,3,4,5,6"([4 x i32]* %y, i32* %pp, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef) #3
+  %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 0
+  %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 1
+  %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 2
+  %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 3
+  %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 4
+  %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 5
+  %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32 } %asm1, 6
+  %arrayidx = getelementptr inbounds i32, i32* %pp, i32 30
+  store i32 %asmresult6, i32* %arrayidx, align 4
+  call void asm sideeffect "", "{r0},{r1},{r2},{r3},{r4},{r5},{r6}"(i32 %asmresult, i32 %asmresult2, i32 %asmresult3, i32 %asmresult4, i32 %asmresult5, i32 %asmresult6, i32 %asmresult7)
+  ret void
+}
diff --git a/llvm/test/CodeGen/Thumb/frame-access.ll b/llvm/test/CodeGen/Thumb/frame-access.ll
--- a/llvm/test/CodeGen/Thumb/frame-access.ll
+++ b/llvm/test/CodeGen/Thumb/frame-access.ll
@@ -124,7 +124,7 @@
 ; CHECK-NEXT: lsls r4, r4, #4
 ; CHECK-NEXT: mov sp, r4
 ; Incoming register varargs stored via FP
-; CHECK: mov r0, r7
+; CHECK: mov r0, r7
 ; CHECK-NEXT: adds r0, #8
 ; CHECK-NEXT: stm r0!, {r1, r2, r3}
 ; VLAs present, access via FP
@@ -199,11 +199,13 @@
 ; CHECK: push {r4, r5, r6, r7, lr}
 ; 20 bytes locals
 ; CHECK: sub sp, #20
+; Setup base pointer
+; CHECK: mov r6, sp
 ; Allocate outgoing arguments space
 ; CHECK: sub sp, #508
 ; CHECK: sub sp, #4
-; Load `e` via SP, 552 = 512 + 20 + 20
-; CHECK: ldr r3, [sp, #552]
+; Load `e` via BP, 40 = 20 + 20
+; CHECK: ldr r3, [r6, #40]
 ; CHECK: bl f
 ; Stack restored before next call
 ; CHECK-NEXT: add sp, #508
@@ -235,11 +237,12 @@
 ; Three incoming register varargs
 ; CHECK: sub sp, #12
 ; 16 bytes callee-saves
-; CHECK: push {r4, r5, r7, lr}
+; CHECK: push {r4, r5, r6, lr}
 ; 20 bytes locals
 ; CHECK: sub sp, #20
-; Incoming varargs stored via SP, 36 = 20 + 16
-; CHECK: add r0, sp, #36
+; Incoming varargs stored via BP, 36 = 20 + 16
+; CHECK: mov r0, r6
+; CHECK-NEXT: adds r0, #36
 ; CHECK-NEXT: stm r0!, {r1, r2, r3}
 ;
@@ -394,17 +397,19 @@
 ; CHECK-LABEL: test_local_moving_sp
 ; Locals area
 ; CHECK: sub sp, #36
+; Setup BP
+; CHECK: mov r6, sp
 ; Outoging arguments
 ; CHECK: sub sp, #508
 ; CHECK-NEXT: sub sp, #508
 ; CHECK-NEXT: sub sp, #8
-; Argument addresses computed relative to SP
-; CHECK: add r4, sp, #1020
-; CHECK-NEXT: adds r4, #24
-; CHECK: add r1, sp, #1020
-; CHECK-NEXT: adds r1, #20
-; CHECK: add r5, sp, #1020
-; CHECK-NEXT: adds r5, #16
+; Argument addresses computed relative to BP
+; CHECK: adds r0, r6, #7
+; CHECK-NEXT: adds r0, #13
+; CHECK: adds r1, r6, #7
+; CHECK-NEXT: adds r1, #9
+; CHECK: adds r5, r6, #7
+; CHECK-NEXT: adds r5, #5
 ; CHECK: bl u
 ; Stack restored before next call
 ; CHECK: add sp, #508
diff --git a/llvm/test/CodeGen/Thumb/large-stack.ll b/llvm/test/CodeGen/Thumb/large-stack.ll
--- a/llvm/test/CodeGen/Thumb/large-stack.ll
+++ b/llvm/test/CodeGen/Thumb/large-stack.ll
@@ -33,9 +33,8 @@
 ; CHECK: sub sp, #508
 ; CHECK: sub sp, #508
 ; CHECK: sub sp, #508
-; ALIGN4: subs r4, r7, #4
-; ALIGN8: subs r4, r7, #7
-; ALIGN8: subs r4, #1
+; CHECK: subs r4, r7, #7
+; CHECK: subs r4, #1
 ; CHECK: mov sp, r4
     %tmp = alloca [ 1524 x i8 ] , align 4
     ret void
@@ -57,9 +56,8 @@
 ; CHECK-LABEL: test2_nofpelim:
 ; CHECK: ldr [[TEMP:r[0-7]]],
 ; CHECK: add sp, [[TEMP]]
-; ALIGN4: subs r4, r7, #4
-; ALIGN8: subs r4, r7, #7
-; ALIGN8: subs r4, #1
+; CHECK: subs r4, r7, #7
+; CHECK: subs r4, #1
 ; CHECK: mov sp, r4
     %tmp = alloca [ 1528 x i8 ] , align 4
     ret void
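
Reviewer note on the offset limits used in determineCalleeSaves() above -- a
minimal standalone sketch, not part of the patch, with hypothetical helper
names: tSTRspi encodes an 8-bit unsigned immediate scaled by 4, and tSTRi a
5-bit unsigned immediate scaled by 4, which is where the (1U << 8) * 4 and
(1U << 5) * 4 estimates come from.

  // Illustrative only; not LLVM API.
  // tSTRspi: str Rt, [sp, #imm8 * 4] -> byte offsets 0..1020 reachable.
  static bool isLegalThumb1SPRelStoreOffset(int Offset) {
    return Offset >= 0 && Offset < (1 << 8) * 4 && (Offset % 4) == 0;
  }
  // tSTRi: str Rt, [Rn, #imm5 * 4] -> byte offsets 0..124 reachable, so a
  // slot 128 bytes or more above the base/frame pointer needs a scavenged
  // register (and hence, potentially, the emergency spill slot).
  static bool isLegalThumb1BPRelStoreOffset(int Offset) {
    return Offset >= 0 && Offset < (1 << 5) * 4 && (Offset % 4) == 0;
  }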