Index: lib/Target/AArch64/AArch64FrameLowering.h
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.h
+++ lib/Target/AArch64/AArch64FrameLowering.h
@@ -41,8 +41,8 @@
   int getFrameIndexReference(const MachineFunction &MF, int FI,
                              unsigned &FrameReg) const override;
   int resolveFrameIndexReference(const MachineFunction &MF, int FI,
-                                 unsigned &FrameReg,
-                                 bool PreferFP = false) const;
+                                 unsigned &FrameReg, bool PreferFP = false,
+                                 bool ForceSP = false) const;
   bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI,
                                  const std::vector<CalleeSavedInfo> &CSI,
@@ -54,7 +54,7 @@
                                    const TargetRegisterInfo *TRI) const override;
 
   /// \brief Can this function use the red zone for local allocations.
-  bool canUseRedZone(const MachineFunction &MF) const;
+  bool canUseRedZone(const MachineFunction &MF, unsigned StackSize) const;
 
   bool hasFP(const MachineFunction &MF) const override;
   bool hasReservedCallFrame(const MachineFunction &MF) const override;
Index: lib/Target/AArch64/AArch64FrameLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.cpp
+++ lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -115,7 +115,8 @@
 
 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
 
-bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
+bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF,
+                                         unsigned StackSize) const {
   if (!EnableRedZone)
     return false;
   // Don't use the red zone if the function explicitly asks us not to.
@@ -124,10 +125,8 @@
     return false;
 
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  unsigned NumBytes = AFI->getLocalStackSize();
 
-  return !(MFI->hasCalls() || hasFP(MF) || NumBytes > 128);
+  return !(MFI->hasCalls() || hasFP(MF) || StackSize > 128);
 }
 
 /// hasFP - Return true if the specified function should have a dedicated frame
@@ -316,7 +315,7 @@
       return;
     // REDZONE: If the stack size is less than 128 bytes, we don't need
     // to actually allocate.
-    if (canUseRedZone(MF))
+    if (canUseRedZone(MF, NumBytes))
       ++NumRedZoneFunctions;
     else {
       emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
@@ -334,10 +333,18 @@
     return;
   }
 
-  NumBytes -= AFI->getCalleeSavedStackSize();
-  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+  auto CSStackSize = AFI->getCalleeSavedStackSize();
   // All of the remaining stack allocations are for locals.
-  AFI->setLocalStackSize(NumBytes);
+  AFI->setLocalStackSize(NumBytes - CSStackSize);
+  bool CSRBumpsSP = AFI->getCalleeSaveRestoreUpdatesSP();
+  if (CSRBumpsSP) {
+    NumBytes -= CSStackSize;
+  } else {
+    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+                    MachineInstr::FrameSetup);
+    NumBytes = 0;
+  }
+  assert(NumBytes >= 0 && "Negative stack allocation size!?");
 
   // Move past the saves of the callee-saved registers.
   MachineBasicBlock::iterator End = MBB.end();
@@ -346,6 +353,8 @@
   if (HasFP) {
     // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
     int FPOffset = AFI->getCalleeSavedStackSize() - 16;
+    if (!CSRBumpsSP)
+      FPOffset += AFI->getLocalStackSize();
 
    // Issue    sub fp, sp, FPOffset or
    //          mov fp,sp          when FPOffset is zero.
@@ -366,7 +375,7 @@
   }
 
   // If we're a leaf function, try using the red zone.
-  if (!canUseRedZone(MF))
+  if (!canUseRedZone(MF, NumBytes))
     // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
     // the correct value here, as NumBytes also includes padding bytes,
     // which shouldn't be counted here.
@@ -569,6 +578,14 @@
   // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
   // it as the 2nd argument of AArch64ISD::TC_RETURN.
 
+  // If there is a single SP update, insert it before the ret and we're done.
+  if (!AFI->getCalleeSaveRestoreUpdatesSP()) {
+    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+                    NumBytes + ArgumentPopSize, TII,
+                    MachineInstr::FrameDestroy);
+    return;
+  }
+
   // Move past the restores of the callee-saved registers.
   MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
   MachineBasicBlock::iterator Begin = MBB.begin();
@@ -583,7 +600,7 @@
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
 
   if (!hasFP(MF)) {
-    bool RedZone = canUseRedZone(MF);
+    bool RedZone = canUseRedZone(MF, NumBytes);
     // If this was a redzone leaf function, we don't need to restore the
     // stack pointer (but we may need to pop stack args for fastcc).
     if (RedZone && ArgumentPopSize == 0)
@@ -634,7 +651,8 @@
 int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
                                                      int FI, unsigned &FrameReg,
-                                                     bool PreferFP) const {
+                                                     bool PreferFP,
+                                                     bool ForceSP) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
       MF.getSubtarget().getRegisterInfo());
@@ -666,12 +684,15 @@
       // using the FP regardless, though, as the SP offset is unknown
       // and we don't have a base pointer available. If an offset is
       // available via the FP and the SP, use whichever is closest.
-      if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 ||
-          (FPOffset >= -256 && Offset > -FPOffset))
+      if (PreferFP || MFI->hasVarSizedObjects())
         UseFP = true;
+      if (!ForceSP)
+        if (FPOffset >= 0 || (FPOffset >= -256 && Offset > -FPOffset))
+          UseFP = true;
     }
   }
 
+  assert(!(UseFP && ForceSP) && "ForceSP flag could not be honored");
   assert((isFixed || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
          "In the presence of dynamic stack pointer realignment, "
          "non-argument objects cannot be accessed through the frame pointer");
@@ -689,8 +710,9 @@
     // If we're using the red zone for this function, the SP won't actually
     // be adjusted, so the offsets will be negative. They're also all
     // within range of the signed 9-bit immediate instructions.
-    if (canUseRedZone(MF))
-      Offset -= AFI->getLocalStackSize();
+    unsigned StackSize = AFI->getLocalStackSize();
+    if (canUseRedZone(MF, StackSize))
+      Offset -= StackSize;
   }
 
   return Offset;
@@ -799,14 +821,6 @@
     if (RPI.isPaired())
       ++i;
   }
-
-  // Align first offset to even 16-byte boundary to avoid additional SP
-  // adjustment instructions.
-  // Last pair offset is size of whole callee-save region for SP
-  // pre-dec/post-inc.
-  RegPairInfo &LastPair = RegPairs.back();
-  assert(AFI->getCalleeSavedStackSize() % 8 == 0);
-  LastPair.Offset = AFI->getCalleeSavedStackSize() / 8;
 }
 
 bool AArch64FrameLowering::spillCalleeSavedRegisters(
@@ -814,10 +828,12 @@
     const std::vector<CalleeSavedInfo> &CSI,
     const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
 
+  bool CSRBumpsSP = AFI->getCalleeSaveRestoreUpdatesSP();
   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
 
   for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
@@ -836,7 +852,7 @@
     // Rationale: This sequence saves uop updates compared to a sequence of
     // pre-increment spills like stp xi,xj,[sp,#-16]!
     // Note: Similar rationale and sequence for restores in epilog.
-    bool BumpSP = RPII == RegPairs.rbegin();
+    bool BumpSP = CSRBumpsSP && RPII == RegPairs.rbegin();
     if (RPI.IsGPR) {
       // For first spill use pre-increment store.
       if (BumpSP)
@@ -858,29 +874,40 @@
           dbgs() << ", " << RPI.FrameIdx+1;
           dbgs() << ")\n");
 
-    const int Offset = BumpSP ? -RPI.Offset : RPI.Offset;
+    int Offset = RPI.Offset;
     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
-    if (BumpSP)
+    // Check and modify offset for pre-increment of SP.
+    if (BumpSP) {
       MIB.addReg(AArch64::SP, RegState::Define);
+      assert(Offset == 0);
+      assert(AFI->getCalleeSavedStackSize() % 8 == 0);
+      Offset = -(AFI->getCalleeSavedStackSize() / 8);
+      assert(Offset % 2 == 0 &&
+             "Callee-save store SP bump not 16-byte aligned");
+    }
 
+    int MIOffset;
     if (RPI.isPaired()) {
       MBB.addLiveIn(Reg1);
       MBB.addLiveIn(Reg2);
       MIB.addReg(Reg2, getPrologueDeath(MF, Reg2))
-          .addReg(Reg1, getPrologueDeath(MF, Reg1))
-          .addReg(AArch64::SP)
-          .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
-          .setMIFlag(MachineInstr::FrameSetup);
+          .addReg(Reg1, getPrologueDeath(MF, Reg1));
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
          MachineMemOperand::MOStore, 8, 8));
+      MIOffset = Offset; // [sp, #offset * 8], where factor * 8 is implicit
     } else {
       MBB.addLiveIn(Reg1);
-      MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
-          .addReg(AArch64::SP)
-          .addImm(BumpSP ? Offset * 8 : Offset) // pre-inc version is unscaled
-          .setMIFlag(MachineInstr::FrameSetup);
+      MIB.addReg(Reg1, getPrologueDeath(MF, Reg1));
+      MIOffset = BumpSP ? Offset * 8 : Offset; // pre-inc version is unscaled
     }
+    if (CSRBumpsSP)
+      MIB.addReg(AArch64::SP)
+          .addImm(MIOffset);
+    else
+      MIB.addFrameIndex(RPI.isPaired() ? RPI.FrameIdx + 1 : RPI.FrameIdx)
+          .addImm(0);
+    MIB.setMIFlag(MachineInstr::FrameSetup);
     MIB.addMemOperand(MF.getMachineMemOperand(
         MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
         MachineMemOperand::MOStore, 8, 8));
@@ -893,6 +920,7 @@
     const std::vector<CalleeSavedInfo> &CSI,
     const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
@@ -900,6 +928,7 @@
   if (MI != MBB.end())
     DL = MI->getDebugLoc();
 
+  bool CSRBumpsSP = AFI->getCalleeSaveRestoreUpdatesSP();
   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
 
   for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
@@ -916,7 +945,7 @@
     //    ldp x22, x21, [sp], #48     // addImm(+6)
     // Note: see comment in spillCalleeSavedRegisters()
     unsigned LdrOpc;
-    bool BumpSP = RPII == std::prev(RegPairs.end());
+    bool BumpSP = CSRBumpsSP && RPII == std::prev(RegPairs.end());
     if (RPI.IsGPR) {
       if (BumpSP)
         LdrOpc = RPI.isPaired() ? AArch64::LDPXpost : AArch64::LDRXpost;
@@ -936,27 +965,38 @@
           dbgs() << ", " << RPI.FrameIdx+1;
           dbgs() << ")\n");
 
-    const int Offset = RPI.Offset;
+    int Offset = RPI.Offset;
     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
-    if (BumpSP)
+    // Check and modify offset for post-decrement of SP.
+    if (BumpSP) {
       MIB.addReg(AArch64::SP, RegState::Define);
+      assert(Offset == 0);
+      assert(AFI->getCalleeSavedStackSize() % 8 == 0);
+      Offset = AFI->getCalleeSavedStackSize() / 8;
+      assert(Offset % 2 == 0 &&
+             "Callee-save restore SP bump not 16-byte aligned");
+    }
 
+    int MIOffset;
     if (RPI.isPaired()) {
       MIB.addReg(Reg2, getDefRegState(true))
-          .addReg(Reg1, getDefRegState(true))
-          .addReg(AArch64::SP)
-          .addImm(Offset) // [sp], #offset * 8  or [sp, #offset * 8]
-                          // where the factor * 8 is implicit
-          .setMIFlag(MachineInstr::FrameDestroy);
+          .addReg(Reg1, getDefRegState(true));
       MIB.addMemOperand(MF.getMachineMemOperand(
          MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
          MachineMemOperand::MOLoad, 8, 8));
+      MIOffset = Offset; // [sp], #offset * 8  or [sp, #offset * 8]
+                         // where the factor * 8 is implicit
     } else {
-      MIB.addReg(Reg1, getDefRegState(true))
-          .addReg(AArch64::SP)
-          .addImm(BumpSP ? Offset * 8 : Offset) // post-dec version is unscaled
-          .setMIFlag(MachineInstr::FrameDestroy);
+      MIB.addReg(Reg1, getDefRegState(true));
+      MIOffset = BumpSP ? Offset * 8 : Offset; // post-dec version is unscaled
     }
+    if (CSRBumpsSP)
+      MIB.addReg(AArch64::SP)
+          .addImm(MIOffset);
+    else
+      MIB.addFrameIndex(RPI.isPaired() ? RPI.FrameIdx + 1 : RPI.FrameIdx)
+          .addImm(0);
+    MIB.setMIFlag(MachineInstr::FrameDestroy);
     MIB.addMemOperand(MF.getMachineMemOperand(
         MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
         MachineMemOperand::MOLoad, 8, 8));
@@ -1073,7 +1113,27 @@
     }
   }
 
-  // Round up to register pair alignment to avoid additional SP adjustment
-  // instructions.
-  AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16));
+  // Check to see if we can combine the callee-save and local stack pointer
+  // adjustment into a single decrement/increment.
+  unsigned EstNonCSStackSize = MFI->estimateStackSize(MF);
+  unsigned StackSize =
+      alignTo(EstNonCSStackSize + 8 * NumRegsSpilled, getStackAlignment());
+  if (EstNonCSStackSize != 0 &&
+      // 512 is the maximum immediate for stp/ldp that will be used for
+      // callee-save save/restores
+      StackSize < 512 &&
+      !MFI->hasVarSizedObjects() && !RegInfo->needsStackRealignment(MF) &&
+      // This isn't strictly necessary, but it simplifies things a bit since
+      // the current RedZone handling code assumes the SP is adjusted by the
+      // callee-save save/restore code.
+      !canUseRedZone(MF, StackSize))
+    AFI->setCalleeSaveRestoreUpdatesSP(false);
+  else
+    // Round up to register pair alignment to avoid additional SP adjustment
+    // instructions. In the combined case we don't need to do this since the
+    // combined CS and local stack pointer adjustment will be 16-byte aligned,
+    // and not doing so here allows us to use less stack in some cases.
+    NumRegsSpilled = alignTo(NumRegsSpilled, 2);
+
+  AFI->setCalleeSavedStackSize(8 * NumRegsSpilled);
 }
Index: lib/Target/AArch64/AArch64MachineFunctionInfo.h
===================================================================
--- lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -83,18 +83,27 @@
   /// frame is unknown at compile time. e.g., in case of VLAs.
   bool StackRealigned;
 
+  /// True when the first/last callee-save save/restore store/load instruction
+  /// should decrement/increment the stack pointer. When this is false, there
+  /// is a single decrement/increment of the stack pointer as the first/last
+  /// non-terminator instruction of the function that allocates/de-allocates
+  /// both the callee-save area and the local area of the stack.
+  bool CalleeSaveRestoreUpdatesSP;
+
 public:
   AArch64FunctionInfo()
       : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
         NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
         VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
-        IsSplitCSR(false), StackRealigned(false) {}
+        IsSplitCSR(false), StackRealigned(false),
+        CalleeSaveRestoreUpdatesSP(true) {}
 
   explicit AArch64FunctionInfo(MachineFunction &MF)
      : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
        NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
        VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0),
-       IsSplitCSR(false), StackRealigned(false) {
+       IsSplitCSR(false), StackRealigned(false),
+       CalleeSaveRestoreUpdatesSP(true) {
     (void)MF;
   }
@@ -121,6 +130,13 @@
   void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
   unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
 
+  void setCalleeSaveRestoreUpdatesSP(bool CSRUpdatesSP) {
+    CalleeSaveRestoreUpdatesSP = CSRUpdatesSP;
+  }
+  bool getCalleeSaveRestoreUpdatesSP() const {
+    return CalleeSaveRestoreUpdatesSP;
+  }
+
   void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
   unsigned getNumLocalDynamicTLSAccesses() const {
     return NumLocalDynamicTLSAccesses;
Index: lib/Target/AArch64/AArch64RegisterInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -388,10 +388,16 @@
   }
 
   // Modify MI as necessary to handle as much of 'Offset' as possible
-  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
+  bool ForceSP = MI.getFlag(MachineInstr::FrameSetup) ||
+                 MI.getFlag(MachineInstr::FrameDestroy);
+  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
+                                           /*PreferFP=*/false, ForceSP);
   if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
     return;
 
+  assert(!MI.getFlag(MachineInstr::FrameSetup) &&
+         !MI.getFlag(MachineInstr::FrameDestroy) &&
+         "Frame setup/destroy can't use emergency spill slot");
   assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
          "Emergency spill slot is out of reach");
Index: test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
===================================================================
--- test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -98,8 +98,8 @@
 ; CHECK-LABEL: novla_nodynamicrealign_call
 ; CHECK: .cfi_startproc
 ;   Check that used callee-saved registers are saved
-; CHECK: stp x19, x30, [sp, #-16]!
-; CHECK: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK: stp x19, x30, [sp, #16]
 ;   Check correctness of cfi pseudo-instructions
 ; CHECK: .cfi_def_cfa_offset 32
 ; CHECK: .cfi_offset w30, -8
@@ -110,17 +110,19 @@
 ;   Check correct access to local variable on the stack, through stack pointer
 ; CHECK: ldr w[[ILOC:[0-9]+]], [sp, #12]
 ;   Check epilogue:
-; CHECK: ldp x19, x30, [sp], #16
+; CHECK: ldp x19, x30, [sp, #16]
+; CHECK: add sp, sp, #32
 ; CHECK: ret
 ; CHECK: .cfi_endproc
 
 ; CHECK-MACHO-LABEL: _novla_nodynamicrealign_call:
 ; CHECK-MACHO: .cfi_startproc
 ;   Check that used callee-saved registers are saved
-; CHECK-MACHO: stp x20, x19, [sp, #-32]!
+; CHECK-MACHO: sub sp, sp, #48
+; CHECK-MACHO: stp x20, x19, [sp, #16]
 ;   Check that the frame pointer is created:
-; CHECK-MACHO: stp x29, x30, [sp, #16]
-; CHECK-MACHO: add x29, sp, #16
+; CHECK-MACHO: stp x29, x30, [sp, #32]
+; CHECK-MACHO: add x29, sp, #32
 ;   Check correctness of cfi pseudo-instructions
 ; CHECK-MACHO: .cfi_def_cfa w29, 16
 ; CHECK-MACHO: .cfi_offset w30, -8
@@ -133,8 +135,9 @@
 ;   Check correct access to local variable on the stack, through stack pointer
 ; CHECK-MACHO: ldr w[[ILOC:[0-9]+]], [sp, #12]
 ;   Check epilogue:
-; CHECK-MACHO: ldp x29, x30, [sp, #16]
-; CHECK-MACHO: ldp x20, x19, [sp], #32
+; CHECK-MACHO: ldp x29, x30, [sp, #32]
+; CHECK-MACHO: ldp x20, x19, [sp, #16]
+; CHECK-MACHO: add sp, sp, #48
 ; CHECK-MACHO: ret
 ; CHECK-MACHO: .cfi_endproc
Index: test/CodeGen/AArch64/arm64-aapcs-be.ll
===================================================================
--- test/CodeGen/AArch64/arm64-aapcs-be.ll
+++ test/CodeGen/AArch64/arm64-aapcs-be.ll
@@ -32,7 +32,7 @@
 
 define void @test_block_addr_callee() {
 ; CHECK-LABEL: test_block_addr_callee:
-; CHECK: str {{[a-z0-9]+}}, [sp, #-16]!
+; CHECK: str {{[a-z0-9]+}}, [sp]
 ; CHECK: bl test_block_addr
   %val = insertvalue [1 x float] undef, float 0.0, 0
   call float @test_block_addr([8 x float] undef, [1 x float] %val)
Index: test/CodeGen/AArch64/arm64-abi.ll
===================================================================
--- test/CodeGen/AArch64/arm64-abi.ll
+++ test/CodeGen/AArch64/arm64-abi.ll
@@ -130,7 +130,7 @@
 ; CHECK-LABEL: test3
 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
 ; FAST-LABEL: test3
-; FAST: sub sp, sp, #32
+; FAST: sub sp, sp, #48
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
   %0 = load <2 x i32>, <2 x i32>* %in, align 8
Index: test/CodeGen/AArch64/arm64-abi_align.ll
===================================================================
--- test/CodeGen/AArch64/arm64-abi_align.ll
+++ test/CodeGen/AArch64/arm64-abi_align.ll
@@ -291,7 +291,7 @@
 ; Space for s2 is allocated at sp
 
 ; FAST-LABEL: caller42
-; FAST: sub sp, sp, #96
+; FAST: sub sp, sp, #112
 ; Space for s1 is allocated at fp-24 = sp+72
 ; Space for s2 is allocated at sp+48
 ; FAST: sub x[[A:[0-9]+]], x29, #24
@@ -317,8 +317,8 @@
 define i32 @caller42_stack() #3 {
 entry:
 ; CHECK-LABEL: caller42_stack
-; CHECK: mov x29, sp
-; CHECK: sub sp, sp, #96
+; CHECK: sub sp, sp, #112
+; CHECK: add x29, sp, #96
 ; CHECK: stur {{x[0-9]+}}, [x29, #-16]
 ; CHECK: stur {{q[0-9]+}}, [x29, #-32]
 ; CHECK: str {{x[0-9]+}}, [sp, #48]
@@ -399,7 +399,7 @@
 ; Space for s2 is allocated at sp
 
 ; FAST-LABEL: caller43
-; FAST: mov x29, sp
+; FAST: add x29, sp, #64
 ; Space for s1 is allocated at sp+32
 ; Space for s2 is allocated at sp
 ; FAST: add x1, sp, #32
@@ -429,8 +429,8 @@
 define i32 @caller43_stack() #3 {
 entry:
 ; CHECK-LABEL: caller43_stack
-; CHECK: mov x29, sp
-; CHECK: sub sp, sp, #96
+; CHECK: sub sp, sp, #112
+; CHECK: add x29, sp, #96
 ; CHECK: stur {{q[0-9]+}}, [x29, #-16]
 ; CHECK: stur {{q[0-9]+}}, [x29, #-32]
 ; CHECK: str {{q[0-9]+}}, [sp, #48]
@@ -446,7 +446,7 @@
 ; CHECK: str w[[C]], [sp]
 
 ; FAST-LABEL: caller43_stack
-; FAST: sub sp, sp, #96
+; FAST: sub sp, sp, #112
 ; Space for s1 is allocated at fp-32 = sp+64
 ; Space for s2 is allocated at sp+32
 ; FAST: sub x[[A:[0-9]+]], x29, #32
@@ -508,7 +508,7 @@
 ; "i64 %0" should be in register x7.
 ; "i32 8" should be on stack at [sp].
 ; CHECK: ldr x7, [{{x[0-9]+}}]
-; CHECK: str {{w[0-9]+}}, [sp, #-16]!
+; CHECK: str {{w[0-9]+}}, [sp]
 ; FAST-LABEL: i64_split
 ; FAST: ldr x7, [{{x[0-9]+}}]
 ; FAST: mov x[[R0:[0-9]+]], sp
Index: test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
===================================================================
--- test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
+++ test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -14,7 +14,7 @@
 define void @main() nounwind {
 entry:
 ; CHECK: main
-; CHECK: mov x29, sp
+; CHECK: add x29, sp, #16
 ; CHECK: mov [[REG:x[0-9]+]], sp
 ; CHECK-NEXT: add x0, [[REG]], #8
   %E = alloca %struct.S2Ty, align 4
Index: test/CodeGen/AArch64/arm64-hello.ll
===================================================================
--- test/CodeGen/AArch64/arm64-hello.ll
+++ test/CodeGen/AArch64/arm64-hello.ll
@@ -2,26 +2,26 @@
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -disable-post-ra | FileCheck %s --check-prefix=CHECK-LINUX
 
 ; CHECK-LABEL: main:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK-NEXT: stp x29, x30, [sp, #16]
+; CHECK-NEXT: add x29, sp, #16
 ; CHECK-NEXT: stur wzr, [x29, #-4]
 ; CHECK: adrp x0, L_.str@PAGE
 ; CHECK: add x0, x0, L_.str@PAGEOFF
 ; CHECK-NEXT: bl _puts
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ldp x29, x30, [sp, #16]
+; CHECK-NEXT: add sp, sp, #32
 ; CHECK-NEXT: ret
 
 ; CHECK-LINUX-LABEL: main:
-; CHECK-LINUX: str x30, [sp, #-16]!
-; CHECK-LINUX-NEXT: sub sp, sp, #16
-; CHECK-LINUX-NEXT: str wzr, [sp, #12]
+; CHECK-LINUX: sub sp, sp, #16
+; CHECK-LINUX-NEXT: str x30, [sp, #8]
+; CHECK-LINUX-NEXT: str wzr, [sp, #4]
 ; CHECK-LINUX: adrp x0, .L.str
 ; CHECK-LINUX: add x0, x0, :lo12:.L.str
 ; CHECK-LINUX-NEXT: bl puts
+; CHECK-LINUX-NEXT: ldr x30, [sp, #8]
 ; CHECK-LINUX-NEXT: add sp, sp, #16
-; CHECK-LINUX-NEXT: ldr x30, [sp], #16
 ; CHECK-LINUX-NEXT: ret
 
 @.str = private unnamed_addr constant [7 x i8] c"hello\0A\00"
Index: test/CodeGen/AArch64/arm64-join-reserved.ll
===================================================================
--- test/CodeGen/AArch64/arm64-join-reserved.ll
+++ test/CodeGen/AArch64/arm64-join-reserved.ll
@@ -5,7 +5,7 @@
 ; A move isn't necessary.
 ;
 ; CHECK-LABEL: g:
-; CHECK: str xzr, [sp, #-16]!
+; CHECK: str xzr, [sp]
 ; CHECK: bl
 ; CHECK: ret
 define void @g() nounwind ssp {
Index: test/CodeGen/AArch64/arm64-memset-inline.ll
===================================================================
--- test/CodeGen/AArch64/arm64-memset-inline.ll
+++ test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -12,9 +12,9 @@
 define void @t2() nounwind ssp {
 entry:
 ; CHECK-LABEL: t2:
-; CHECK: strh wzr, [sp, #32]
-; CHECK: stp xzr, xzr, [sp, #16]
-; CHECK: str xzr, [sp, #8]
+; CHECK: strh wzr, [sp, #24]
+; CHECK: stp xzr, xzr, [sp, #8]
+; CHECK: str xzr, [sp]
   %buf = alloca [26 x i8], align 1
   %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0
   call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
Index: test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
===================================================================
--- test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
+++ test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
@@ -7,7 +7,7 @@
 entry:
 ; CHECK-LABEL: jscall_patchpoint_codegen:
 ; CHECK: Ltmp
-; CHECK: str x{{.+}}, [sp, #-16]!
+; CHECK: str x{{.+}}, [sp]
 ; CHECK-NEXT: mov x0, x{{.+}}
 ; CHECK: Ltmp
 ; CHECK-NEXT: movz x16, #0xffff, lsl #32
@@ -16,7 +16,7 @@
 ; CHECK-NEXT: blr x16
 ; FAST-LABEL: jscall_patchpoint_codegen:
 ; FAST: Ltmp
-; FAST: str x{{.+}}, [sp, #-16]!
+; FAST: str x{{.+}}, [sp]
 ; FAST: Ltmp
 ; FAST-NEXT: movz x16, #0xffff, lsl #32
 ; FAST-NEXT: movk x16, #0xdead, lsl #16
@@ -50,7 +50,7 @@
 ; FAST: orr [[REG1:x[0-9]+]], xzr, #0x2
 ; FAST-NEXT: orr [[REG2:w[0-9]+]], wzr, #0x4
 ; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6
-; FAST-NEXT: str [[REG1]], [sp, #-32]!
+; FAST-NEXT: str [[REG1]], [sp]
 ; FAST-NEXT: str [[REG2]], [sp, #16]
 ; FAST-NEXT: str [[REG3]], [sp, #24]
 ; FAST: Ltmp
@@ -90,7 +90,7 @@
 ; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6
 ; FAST-NEXT: orr [[REG4:w[0-9]+]], wzr, #0x8
 ; FAST-NEXT: movz [[REG5:x[0-9]+]], #0xa
-; FAST-NEXT: str [[REG1]], [sp, #-64]!
+; FAST-NEXT: str [[REG1]], [sp]
 ; FAST-NEXT: str [[REG2]], [sp, #16]
 ; FAST-NEXT: str [[REG3]], [sp, #24]
 ; FAST-NEXT: str [[REG4]], [sp, #36]
Index: test/CodeGen/AArch64/arm64-patchpoint.ll
===================================================================
--- test/CodeGen/AArch64/arm64-patchpoint.ll
+++ test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -26,10 +26,11 @@
 ; as a leaf function.
 ;
 ; CHECK-LABEL: caller_meta_leaf
-; CHECK: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #32
+; CHECK: sub sp, sp, #48
+; CHECK-NEXT: stp x29, x30, [sp, #32]
+; CHECK-NEXT: add x29, sp, #32
 ; CHECK: Ltmp
-; CHECK: add sp, sp, #32
+; CHECK: add sp, sp, #48
 ; CHECK: ret
 
 define void @caller_meta_leaf() {
Index: test/CodeGen/AArch64/arm64-shrink-wrapping.ll
===================================================================
--- test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -13,9 +13,9 @@
 ; ENABLE-NEXT: b.ge [[EXIT_LABEL:LBB[0-9_]+]]
 ;
 ; Prologue code.
-; CHECK: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #-16]!
-; CHECK-NEXT: mov [[SAVE_SP]], sp
-; CHECK-NEXT: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK-NEXT: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #16]
+; CHECK-NEXT: add [[SAVE_SP]], sp, #16
 ;
 ; Compare the arguments and jump to exit.
 ; After the prologue is set.
@@ -33,8 +33,8 @@
 ; Without shrink-wrapping, epilogue is in the exit block.
 ; DISABLE: [[EXIT_LABEL]]:
 ; Epilogue code.
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x{{[0-9]+}}, [[CSR]], [sp], #16
+; CHECK-NEXT: ldp x{{[0-9]+}}, [[CSR]], [sp, #16]
+; CHECK-NEXT: add sp, sp, #32
 ;
 ; With shrink-wrapping, exit block is a simple return.
 ; ENABLE: [[EXIT_LABEL]]:
@@ -454,9 +454,9 @@
 ; ENABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; Prologue code.
-; CHECK: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #-16]!
-; CHECK-NEXT: mov [[NEW_SP:x[0-9]+]], sp
-; CHECK-NEXT: sub sp, sp, #48
+; CHECK: sub sp, sp, #64
+; CHECK-NEXT: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #48]
+; CHECK-NEXT: add [[NEW_SP:x[0-9]+]], sp, #48
 ;
 ; DISABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
 ; Setup of the varags.
@@ -473,8 +473,8 @@
 ; DISABLE: [[IFEND_LABEL]]: ; %if.end
 ;
 ; Epilogue code.
-; CHECK: add sp, sp, #48
-; CHECK-NEXT: ldp [[CSR1]], [[CSR2]], [sp], #16
+; CHECK: ldp [[CSR1]], [[CSR2]], [sp, #48]
+; CHECK-NEXT: add sp, sp, #64
 ; CHECK-NEXT: ret
 ;
 ; ENABLE: [[ELSE_LABEL]]: ; %if.else
Index: test/CodeGen/AArch64/arm64-virtual_base.ll
===================================================================
--- test/CodeGen/AArch64/arm64-virtual_base.ll
+++ test/CodeGen/AArch64/arm64-virtual_base.ll
@@ -34,9 +34,9 @@
 define void @Precompute_Patch_Values(%struct.Bicubic_Patch_Struct* %Shape) {
 ; CHECK: Precompute_Patch_Values
 ; CHECK: ldr [[VAL:x[0-9]+]], [x0, #288]
-; CHECK-NEXT: str [[VAL]], [sp, #232]
+; CHECK-NEXT: str [[VAL]], [sp, #240]
 ; CHECK-NEXT: ldr [[VAL2:q[0-9]+]], [x0, #272]
-; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #216]
+; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #224]
 entry:
   %Control_Points = alloca [16 x [3 x double]], align 8
   %arraydecay5.3.1 = getelementptr inbounds [16 x [3 x double]], [16 x [3 x double]]* %Control_Points, i64 0, i64 9, i64 0
Index: test/CodeGen/AArch64/fastcc.ll
===================================================================
--- test/CodeGen/AArch64/fastcc.ll
+++ test/CodeGen/AArch64/fastcc.ll
@@ -7,13 +7,16 @@
 
 define fastcc void @func_stack0() {
 ; CHECK-LABEL: func_stack0:
-; CHECK: mov x29, sp
-; CHECK: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK: sub sp, sp, #48
+; CHECK-NEXT: stp x29, x30, [sp, #32]
+; CHECK-NEXT: add x29, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp]
 
 ; CHECK-TAIL-LABEL: func_stack0:
-; CHECK-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-TAIL-NEXT: mov x29, sp
-; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL-NEXT: stp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add x29, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp]
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -42,27 +45,29 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK: ldp x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-TAIL: ldp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add sp, sp, #48
 ; CHECK-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack8([8 x i32], i32 %stacked) {
 ; CHECK-LABEL: func_stack8:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK: mov x29, sp
-; CHECK: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK: sub sp, sp, #48
+; CHECK-NEXT: stp x29, x30, [sp, #32]
+; CHECK-NEXT: add x29, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp]
 
 ; CHECK-TAIL-LABEL: func_stack8:
-; CHECK-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-TAIL: mov x29, sp
-; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL: stp x29, x30, [sp, #32]
+; CHECK-TAIL: add x29, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp]
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -91,23 +96,24 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK: ldp x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
-; CHECK-TAIL-NEXT: add sp, sp, #16
+; CHECK-TAIL: ldp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add sp, sp, #64
 ; CHECK-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
 ; CHECK-LABEL: func_stack32:
-; CHECK: mov x29, sp
+; CHECK: sub sp, sp, #48
+; CHECK-NEXT: stp x29, x30, [sp, #32]
+; CHECK-NEXT: add x29, sp, #32
 
 ; CHECK-TAIL-LABEL: func_stack32:
-; CHECK-TAIL: mov x29, sp
+; CHECK-TAIL: add x29, sp, #32
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -136,13 +142,12 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK: ldp x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
-; CHECK-TAIL-NEXT: add sp, sp, #32
+; CHECK-TAIL: ldp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add sp, sp, #80
 ; CHECK-TAIL-NEXT: ret
 }
@@ -180,22 +185,21 @@
 ; Check that arg stack pop is done after callee-save restore when no frame pointer is used.
 define fastcc void @func_stack32_leaf_local([8 x i32], i128 %stacked0, i128 %stacked1) {
 ; CHECK-LABEL: func_stack32_leaf_local:
-; CHECK: str x20, [sp, #-16]!
-; CHECK-NEXT: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK-NEXT: str x20, [sp, #24]
 ; CHECK: nop
 ; CHECK-NEXT: //NO_APP
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldr x20, [sp], #16
+; CHECK-NEXT: ldr x20, [sp, #24]
+; CHECK-NEXT: add sp, sp, #32
 ; CHECK-NEXT: ret
 
 ; CHECK-TAIL-LABEL: func_stack32_leaf_local:
-; CHECK-TAIL: str x20, [sp, #-16]!
-; CHECK-TAIL-NEXT: sub sp, sp, #16
+; CHECK-TAIL: sub sp, sp, #32
+; CHECK-TAIL-NEXT: str x20, [sp, #24]
 ; CHECK-TAIL: nop
 ; CHECK-TAIL-NEXT: //NO_APP
-; CHECK-TAIL-NEXT: add sp, sp, #16
-; CHECK-TAIL-NEXT: ldr x20, [sp], #16
-; CHECK-TAIL-NEXT: add sp, sp, #32
+; CHECK-TAIL-NEXT: ldr x20, [sp, #24]
+; CHECK-TAIL-NEXT: add sp, sp, #64
 ; CHECK-TAIL-NEXT: ret
 
 ; CHECK-TAIL-RZ-LABEL: func_stack32_leaf_local:
Index: test/CodeGen/AArch64/func-calls.ll
===================================================================
--- test/CodeGen/AArch64/func-calls.ll
+++ test/CodeGen/AArch64/func-calls.ll
@@ -89,11 +89,11 @@
 ; that varstruct is passed on the stack. Rather dependent on how a
 ; memcpy gets created, but the following works for now.
 
-; CHECK-DAG: str {{q[0-9]+}}, [sp, #-16]
+; CHECK-DAG: str {{q[0-9]+}}, [sp]
 ; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
 ; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b
 
-; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp, #-16]!
+; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp]
 ; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
 ; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]]
Index: test/CodeGen/AArch64/remat.ll
===================================================================
--- test/CodeGen/AArch64/remat.ll
+++ test/CodeGen/AArch64/remat.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=exynos-m1 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=kryo -o - %s | FileCheck %s
 
-%X = type { i64, i64, i64 }
+%X = type { i64, i64, i64, i64 }
 declare void @f(%X*)
 define void @t() {
 entry:
Index: test/CodeGen/AArch64/tailcall-implicit-sret.ll
===================================================================
--- test/CodeGen/AArch64/tailcall-implicit-sret.ll
+++ test/CodeGen/AArch64/tailcall-implicit-sret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false -disable-post-ra | FileCheck %s
 ; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
Index: test/DebugInfo/AArch64/prologue_end.ll
===================================================================
--- test/DebugInfo/AArch64/prologue_end.ll
+++ test/DebugInfo/AArch64/prologue_end.ll
@@ -9,9 +9,9 @@
 define void @prologue_end_test() nounwind uwtable !dbg !4 {
   ; CHECK: prologue_end_test:
   ; CHECK: .cfi_startproc
-  ; CHECK: stp x29, x30
-  ; CHECK: mov x29, sp
   ; CHECK: sub sp, sp
+  ; CHECK: stp x29, x30
+  ; CHECK: add x29, sp
   ; CHECK: .loc 1 3 3 prologue_end
   ; CHECK: bl _func
   ; CHECK: bl _func