Index: llvm/trunk/lib/Target/AArch64/AArch64FrameLowering.h =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64FrameLowering.h +++ llvm/trunk/lib/Target/AArch64/AArch64FrameLowering.h @@ -66,6 +66,10 @@ bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; } + +private: + bool shouldCombineCSRLocalStackBump(MachineFunction &MF, + unsigned StackBumpBytes) const; }; } // End llvm namespace Index: llvm/trunk/lib/Target/AArch64/AArch64FrameLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64FrameLowering.cpp +++ llvm/trunk/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -283,6 +283,127 @@ return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister; } +bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( + MachineFunction &MF, unsigned StackBumpBytes) const { + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + + if (AFI->getLocalStackSize() == 0) + return false; + + // 512 is the maximum immediate for stp/ldp that will be used for + // callee-save save/restores + if (StackBumpBytes >= 512) + return false; + + if (MFI->hasVarSizedObjects()) + return false; + + if (RegInfo->needsStackRealignment(MF)) + return false; + + // This isn't strictly necessary, but it simplifies things a bit since the + // current RedZone handling code assumes the SP is adjusted by the + // callee-save save/restore code. + if (canUseRedZone(MF)) + return false; + + return true; +} + +// Convert callee-save register save/restore instruction to do stack pointer +// decrement/increment to allocate/deallocate the callee-save stack area by +// converting store/load to use pre/post increment version. 
+static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, + const TargetInstrInfo *TII, int CSStackSizeInc) { + + unsigned NewOpc; + bool NewIsUnscaled = false; + switch (MBBI->getOpcode()) { + default: + llvm_unreachable("Unexpected callee-save save/restore opcode!"); + case AArch64::STPXi: + NewOpc = AArch64::STPXpre; + break; + case AArch64::STPDi: + NewOpc = AArch64::STPDpre; + break; + case AArch64::STRXui: + NewOpc = AArch64::STRXpre; + NewIsUnscaled = true; + break; + case AArch64::STRDui: + NewOpc = AArch64::STRDpre; + NewIsUnscaled = true; + break; + case AArch64::LDPXi: + NewOpc = AArch64::LDPXpost; + break; + case AArch64::LDPDi: + NewOpc = AArch64::LDPDpost; + break; + case AArch64::LDRXui: + NewOpc = AArch64::LDRXpost; + NewIsUnscaled = true; + break; + case AArch64::LDRDui: + NewOpc = AArch64::LDRDpost; + NewIsUnscaled = true; + break; + } + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + MIB.addReg(AArch64::SP, RegState::Define); + + // Copy all operands other than the immediate offset. + unsigned OpndIdx = 0; + for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd; + ++OpndIdx) + MIB.addOperand(MBBI->getOperand(OpndIdx)); + + assert(MBBI->getOperand(OpndIdx).getImm() == 0 && + "Unexpected immediate offset in first/last callee-save save/restore " + "instruction!"); + assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. 
+ assert(CSStackSizeInc % 8 == 0); + int64_t CSStackSizeIncImm = CSStackSizeInc; + if (!NewIsUnscaled) + CSStackSizeIncImm /= 8; + MIB.addImm(CSStackSizeIncImm); + + MIB.setMIFlags(MBBI->getFlags()); + MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end()); + + return std::prev(MBB.erase(MBBI)); +} + +// Fixup callee-save register save/restore instructions to take into account +// combined SP bump by adding the local stack size to the stack offsets. +static void fixupCalleeSaveRestoreStackOffset(MachineInstr *MI, + unsigned LocalStackSize) { + unsigned Opc = MI->getOpcode(); + (void)Opc; + assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi || + Opc == AArch64::STRXui || Opc == AArch64::STRDui || + Opc == AArch64::LDPXi || Opc == AArch64::LDPDi || + Opc == AArch64::LDRXui || Opc == AArch64::LDRDui) && + "Unexpected callee-save save/restore opcode!"); + + unsigned OffsetIdx = MI->getNumExplicitOperands() - 1; + assert(MI->getOperand(OffsetIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. + MachineOperand &OffsetOpnd = MI->getOperand(OffsetIdx); + // All generated opcodes have scaled offsets. + assert(LocalStackSize % 8 == 0); + OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8); +} + void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -334,18 +455,36 @@ return; } - NumBytes -= AFI->getCalleeSavedStackSize(); - assert(NumBytes >= 0 && "Negative stack allocation size!?"); + auto CSStackSize = AFI->getCalleeSavedStackSize(); // All of the remaining stack allocations are for locals. - AFI->setLocalStackSize(NumBytes); + AFI->setLocalStackSize(NumBytes - CSStackSize); - // Move past the saves of the callee-saved registers. 
+ bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + if (CombineSPBump) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + NumBytes = 0; + } else if (CSStackSize != 0) { + MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII, + -CSStackSize); + NumBytes -= CSStackSize; + } + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + // Move past the saves of the callee-saved registers, fixing up the offsets + // if we decided to combine the callee-save and local stack pointer bump + // above. MachineBasicBlock::iterator End = MBB.end(); - while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) + while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) { + if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(MBBI, AFI->getLocalStackSize()); ++MBBI; + } if (HasFP) { // Only set up FP if we actually need to. Frame pointer is fp = sp - 16. - int FPOffset = AFI->getCalleeSavedStackSize() - 16; + int FPOffset = CSStackSize - 16; + if (CombineSPBump) + FPOffset += AFI->getLocalStackSize(); // Issue sub fp, sp, FPOffset or // mov fp,sp when FPOffset is zero. @@ -569,6 +708,13 @@ // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps // it as the 2nd argument of AArch64ISD::TC_RETURN. + auto CSStackSize = AFI->getCalleeSavedStackSize(); + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + + if (!CombineSPBump && CSStackSize != 0) + convertCalleeSaveRestoreToSPPrePostIncDec( + MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize); + // Move past the restores of the callee-saved registers. 
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); MachineBasicBlock::iterator Begin = MBB.begin(); @@ -577,9 +723,19 @@ if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) { ++LastPopI; break; - } + } else if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(LastPopI, AFI->getLocalStackSize()); } - NumBytes -= AFI->getCalleeSavedStackSize(); + + // If there is a single SP update, insert it before the ret and we're done. + if (CombineSPBump) { + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + NumBytes + ArgumentPopSize, TII, + MachineInstr::FrameDestroy); + return; + } + + NumBytes -= CSStackSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { @@ -589,7 +745,7 @@ if (RedZone && ArgumentPopSize == 0) return; - bool NoCalleeSaveRestore = AFI->getCalleeSavedStackSize() == 0; + bool NoCalleeSaveRestore = CSStackSize == 0; int StackRestoreBytes = RedZone ? 0 : NumBytes; if (NoCalleeSaveRestore) StackRestoreBytes += ArgumentPopSize; @@ -608,8 +764,7 @@ // be able to save any instructions. if (MFI->hasVarSizedObjects() || AFI->isStackRealigned()) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -AFI->getCalleeSavedStackSize() + 16, TII, - MachineInstr::FrameDestroy); + -CSStackSize + 16, TII, MachineInstr::FrameDestroy); else if (NumBytes) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, MachineInstr::FrameDestroy); @@ -799,14 +954,6 @@ if (RPI.isPaired()) ++i; } - - // Align first offset to even 16-byte boundary to avoid additional SP - // adjustment instructions. - // Last pair offset is size of whole callee-save region for SP - // pre-dec/post-inc. 
- RegPairInfo &LastPair = RegPairs.back(); - assert(AFI->getCalleeSavedStackSize() % 8 == 0); - LastPair.Offset = AFI->getCalleeSavedStackSize() / 8; } bool AArch64FrameLowering::spillCalleeSavedRegisters( @@ -827,29 +974,20 @@ unsigned Reg2 = RPI.Reg2; unsigned StrOpc; - // Issue sequence of non-sp increment and pi sp spills for cs regs. The - // first spill is a pre-increment that allocates the stack. + // Issue sequence of spills for cs regs. The first spill may be converted + // to a pre-decrement store later by emitPrologue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. // For example: - // stp x22, x21, [sp, #-48]! // addImm(-6) + // stp x22, x21, [sp, #0] // addImm(+0) // stp x20, x19, [sp, #16] // addImm(+2) // stp fp, lr, [sp, #32] // addImm(+4) // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! // Note: Similar rationale and sequence for restores in epilog. - bool BumpSP = RPII == RegPairs.rbegin(); - if (RPI.IsGPR) { - // For first spill use pre-increment store. - if (BumpSP) - StrOpc = RPI.isPaired() ? AArch64::STPXpre : AArch64::STRXpre; - else - StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; - } else { - // For first spill use pre-increment store. - if (BumpSP) - StrOpc = RPI.isPaired() ? AArch64::STPDpre : AArch64::STRDpre; - else - StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; - } + if (RPI.IsGPR) + StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; + else + StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1); if (RPI.isPaired()) dbgs() << ", " << TRI->getName(Reg2); @@ -858,29 +996,19 @@ dbgs() << ", " << RPI.FrameIdx+1; dbgs() << ")\n"); - const int Offset = BumpSP ? 
-RPI.Offset : RPI.Offset; MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - if (BumpSP) - MIB.addReg(AArch64::SP, RegState::Define); - + MBB.addLiveIn(Reg1); if (RPI.isPaired()) { - MBB.addLiveIn(Reg1); MBB.addLiveIn(Reg2); - MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) - .addReg(Reg1, getPrologueDeath(MF, Reg1)) - .addReg(AArch64::SP) - .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit - .setMIFlag(MachineInstr::FrameSetup); + MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), MachineMemOperand::MOStore, 8, 8)); - } else { - MBB.addLiveIn(Reg1); - MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) + } + MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) - .addImm(BumpSP ? Offset * 8 : Offset) // pre-inc version is unscaled + .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit .setMIFlag(MachineInstr::FrameSetup); - } MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), MachineMemOperand::MOStore, 8, 8)); @@ -908,26 +1036,19 @@ unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; - // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only - // the last load is sp-pi post-increment and de-allocates the stack: + // Issue sequence of restores for cs regs. The last restore may be converted + // to a post-increment load later by emitEpilogue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. // For example: // ldp fp, lr, [sp, #32] // addImm(+4) // ldp x20, x19, [sp, #16] // addImm(+2) - // ldp x22, x21, [sp], #48 // addImm(+6) + // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; - bool BumpSP = RPII == std::prev(RegPairs.end()); - if (RPI.IsGPR) { - if (BumpSP) - LdrOpc = RPI.isPaired() ? 
AArch64::LDPXpost : AArch64::LDRXpost; - else - LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; - } else { - if (BumpSP) - LdrOpc = RPI.isPaired() ? AArch64::LDPDpost : AArch64::LDRDpost; - else - LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; - } + if (RPI.IsGPR) + LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; + else + LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1); if (RPI.isPaired()) dbgs() << ", " << TRI->getName(Reg2); @@ -936,27 +1057,17 @@ dbgs() << ", " << RPI.FrameIdx+1; dbgs() << ")\n"); - const int Offset = RPI.Offset; MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); - if (BumpSP) - MIB.addReg(AArch64::SP, RegState::Define); - if (RPI.isPaired()) { - MIB.addReg(Reg2, getDefRegState(true)) - .addReg(Reg1, getDefRegState(true)) - .addReg(AArch64::SP) - .addImm(Offset) // [sp], #offset * 8 or [sp, #offset * 8] - // where the factor * 8 is implicit - .setMIFlag(MachineInstr::FrameDestroy); + MIB.addReg(Reg2, getDefRegState(true)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), MachineMemOperand::MOLoad, 8, 8)); - } else { - MIB.addReg(Reg1, getDefRegState(true)) + } + MIB.addReg(Reg1, getDefRegState(true)) .addReg(AArch64::SP) - .addImm(BumpSP ? 
Offset * 8 : Offset) // post-dec version is unscaled + .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit .setMIFlag(MachineInstr::FrameDestroy); - } MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), MachineMemOperand::MOLoad, 8, 8)); Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2393,6 +2393,9 @@ if (DestReg == SrcReg && Offset == 0) return; + assert((DestReg != AArch64::SP || Offset % 16 == 0) && + "SP increment/decrement not 16-byte aligned"); + bool isSub = Offset < 0; if (isSub) Offset = -Offset; Index: llvm/trunk/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll +++ llvm/trunk/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll @@ -98,8 +98,8 @@ ; CHECK-LABEL: novla_nodynamicrealign_call ; CHECK: .cfi_startproc ; Check that used callee-saved registers are saved -; CHECK: stp x19, x30, [sp, #-16]! -; CHECK: sub sp, sp, #16 +; CHECK: sub sp, sp, #32 +; CHECK: stp x19, x30, [sp, #16] ; Check correctness of cfi pseudo-instructions ; CHECK: .cfi_def_cfa_offset 32 ; CHECK: .cfi_offset w30, -8 @@ -110,17 +110,18 @@ ; Check correct access to local variable on the stack, through stack pointer ; CHECK: ldr w[[ILOC:[0-9]+]], [sp, #12] ; Check epilogue: -; CHECK: ldp x19, x30, [sp], #16 +; CHECK: ldp x19, x30, [sp, #16] ; CHECK: ret ; CHECK: .cfi_endproc ; CHECK-MACHO-LABEL: _novla_nodynamicrealign_call: ; CHECK-MACHO: .cfi_startproc ; Check that used callee-saved registers are saved -; CHECK-MACHO: stp x20, x19, [sp, #-32]! 
+; CHECK-MACHO: sub sp, sp, #48 +; CHECK-MACHO: stp x20, x19, [sp, #16] ; Check that the frame pointer is created: -; CHECK-MACHO: stp x29, x30, [sp, #16] -; CHECK-MACHO: add x29, sp, #16 +; CHECK-MACHO: stp x29, x30, [sp, #32] +; CHECK-MACHO: add x29, sp, #32 ; Check correctness of cfi pseudo-instructions ; CHECK-MACHO: .cfi_def_cfa w29, 16 ; CHECK-MACHO: .cfi_offset w30, -8 @@ -133,8 +134,8 @@ ; Check correct access to local variable on the stack, through stack pointer ; CHECK-MACHO: ldr w[[ILOC:[0-9]+]], [sp, #12] ; Check epilogue: -; CHECK-MACHO: ldp x29, x30, [sp, #16] -; CHECK-MACHO: ldp x20, x19, [sp], #32 +; CHECK-MACHO: ldp x29, x30, [sp, #32] +; CHECK-MACHO: ldp x20, x19, [sp, #16] ; CHECK-MACHO: ret ; CHECK-MACHO: .cfi_endproc Index: llvm/trunk/test/CodeGen/AArch64/arm64-aapcs-be.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/arm64-aapcs-be.ll +++ llvm/trunk/test/CodeGen/AArch64/arm64-aapcs-be.ll @@ -32,7 +32,8 @@ define void @test_block_addr_callee() { ; CHECK-LABEL: test_block_addr_callee: -; CHECK: str {{[a-z0-9]+}}, [sp, #-16]! 
+; CHECK: sub sp, sp, #32 +; CHECK: str {{[a-z0-9]+}}, [sp, #16] ; CHECK: bl test_block_addr %val = insertvalue [1 x float] undef, float 0.0, 0 call float @test_block_addr([8 x float] undef, [1 x float] %val) Index: llvm/trunk/test/CodeGen/AArch64/arm64-abi.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/arm64-abi.ll +++ llvm/trunk/test/CodeGen/AArch64/arm64-abi.ll @@ -130,7 +130,7 @@ ; CHECK-LABEL: test3 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8] ; FAST-LABEL: test3 -; FAST: sub sp, sp, #32 +; FAST: sub sp, sp, #48 ; FAST: mov x[[ADDR:[0-9]+]], sp ; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8] %0 = load <2 x i32>, <2 x i32>* %in, align 8 Index: llvm/trunk/test/CodeGen/AArch64/arm64-abi_align.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/arm64-abi_align.ll +++ llvm/trunk/test/CodeGen/AArch64/arm64-abi_align.ll @@ -291,7 +291,7 @@ ; Space for s2 is allocated at sp ; FAST-LABEL: caller42 -; FAST: sub sp, sp, #96 +; FAST: sub sp, sp, #112 ; Space for s1 is allocated at fp-24 = sp+72 ; Space for s2 is allocated at sp+48 ; FAST: sub x[[A:[0-9]+]], x29, #24 @@ -317,8 +317,8 @@ define i32 @caller42_stack() #3 { entry: ; CHECK-LABEL: caller42_stack -; CHECK: mov x29, sp -; CHECK: sub sp, sp, #96 +; CHECK: sub sp, sp, #112 +; CHECK: add x29, sp, #96 ; CHECK: stur {{x[0-9]+}}, [x29, #-16] ; CHECK: stur {{q[0-9]+}}, [x29, #-32] ; CHECK: str {{x[0-9]+}}, [sp, #48] @@ -399,7 +399,7 @@ ; Space for s2 is allocated at sp ; FAST-LABEL: caller43 -; FAST: mov x29, sp +; FAST: add x29, sp, #64 ; Space for s1 is allocated at sp+32 ; Space for s2 is allocated at sp ; FAST: add x1, sp, #32 @@ -429,8 +429,8 @@ define i32 @caller43_stack() #3 { entry: ; CHECK-LABEL: caller43_stack -; CHECK: mov x29, sp -; CHECK: sub sp, sp, #96 +; CHECK: sub sp, sp, #112 +; CHECK: add x29, sp, #96 ; CHECK: stur {{q[0-9]+}}, [x29, #-16] ; CHECK: stur {{q[0-9]+}}, [x29, #-32] ; CHECK: str 
{{q[0-9]+}}, [sp, #48] @@ -446,7 +446,7 @@ ; CHECK: str w[[C]], [sp] ; FAST-LABEL: caller43_stack -; FAST: sub sp, sp, #96 +; FAST: sub sp, sp, #112 ; Space for s1 is allocated at fp-32 = sp+64 ; Space for s2 is allocated at sp+32 ; FAST: sub x[[A:[0-9]+]], x29, #32 @@ -508,7 +508,7 @@ ; "i64 %0" should be in register x7. ; "i32 8" should be on stack at [sp]. ; CHECK: ldr x7, [{{x[0-9]+}}] -; CHECK: str {{w[0-9]+}}, [sp, #-16]! +; CHECK: str {{w[0-9]+}}, [sp] ; FAST-LABEL: i64_split ; FAST: ldr x7, [{{x[0-9]+}}] ; FAST: mov x[[R0:[0-9]+]], sp Index: llvm/trunk/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll +++ llvm/trunk/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll @@ -14,7 +14,7 @@ define void @main() nounwind { entry: ; CHECK: main -; CHECK: mov x29, sp +; CHECK: add x29, sp, #16 ; CHECK: mov [[REG:x[0-9]+]], sp ; CHECK-NEXT: add x0, [[REG]], #8 %E = alloca %struct.S2Ty, align 4 Index: llvm/trunk/test/CodeGen/AArch64/arm64-hello.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/arm64-hello.ll +++ llvm/trunk/test/CodeGen/AArch64/arm64-hello.ll @@ -2,26 +2,26 @@ ; RUN: llc < %s -mtriple=arm64-linux-gnu -disable-post-ra | FileCheck %s --check-prefix=CHECK-LINUX ; CHECK-LABEL: main: -; CHECK: stp x29, x30, [sp, #-16]! -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 +; CHECK: sub sp, sp, #32 +; CHECK-NEXT: stp x29, x30, [sp, #16] +; CHECK-NEXT: add x29, sp, #16 ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK: adrp x0, L_.str@PAGE ; CHECK: add x0, x0, L_.str@PAGEOFF ; CHECK-NEXT: bl _puts -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ldp x29, x30, [sp, #16] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; CHECK-LINUX-LABEL: main: -; CHECK-LINUX: str x30, [sp, #-16]! 
-; CHECK-LINUX-NEXT: sub sp, sp, #16 +; CHECK-LINUX: sub sp, sp, #32 +; CHECK-LINUX-NEXT: str x30, [sp, #16] ; CHECK-LINUX-NEXT: str wzr, [sp, #12] ; CHECK-LINUX: adrp x0, .L.str ; CHECK-LINUX: add x0, x0, :lo12:.L.str ; CHECK-LINUX-NEXT: bl puts -; CHECK-LINUX-NEXT: add sp, sp, #16 -; CHECK-LINUX-NEXT: ldr x30, [sp], #16 +; CHECK-LINUX-NEXT: ldr x30, [sp, #16] +; CHECK-LINUX-NEXT: add sp, sp, #32 ; CHECK-LINUX-NEXT: ret @.str = private unnamed_addr constant [7 x i8] c"hello\0A\00" Index: llvm/trunk/test/CodeGen/AArch64/arm64-join-reserved.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/arm64-join-reserved.ll +++ llvm/trunk/test/CodeGen/AArch64/arm64-join-reserved.ll @@ -5,7 +5,7 @@ ; A move isn't necessary. ; ; CHECK-LABEL: g: -; CHECK: str xzr, [sp, #-16]! +; CHECK: str xzr, [sp] ; CHECK: bl ; CHECK: ret define void @g() nounwind ssp { Index: llvm/trunk/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll +++ llvm/trunk/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll @@ -7,7 +7,7 @@ entry: ; CHECK-LABEL: jscall_patchpoint_codegen: ; CHECK: Ltmp -; CHECK: str x{{.+}}, [sp, #-16]! +; CHECK: str x{{.+}}, [sp] ; CHECK-NEXT: mov x0, x{{.+}} ; CHECK: Ltmp ; CHECK-NEXT: movz x16, #0xffff, lsl #32 @@ -16,7 +16,7 @@ ; CHECK-NEXT: blr x16 ; FAST-LABEL: jscall_patchpoint_codegen: ; FAST: Ltmp -; FAST: str x{{.+}}, [sp, #-16]! +; FAST: str x{{.+}}, [sp] ; FAST: Ltmp ; FAST-NEXT: movz x16, #0xffff, lsl #32 ; FAST-NEXT: movk x16, #0xdead, lsl #16 @@ -50,7 +50,7 @@ ; FAST: orr [[REG1:x[0-9]+]], xzr, #0x2 ; FAST-NEXT: orr [[REG2:w[0-9]+]], wzr, #0x4 ; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6 -; FAST-NEXT: str [[REG1]], [sp, #-32]! 
+; FAST-NEXT: str [[REG1]], [sp] ; FAST-NEXT: str [[REG2]], [sp, #16] ; FAST-NEXT: str [[REG3]], [sp, #24] ; FAST: Ltmp @@ -90,7 +90,7 @@ ; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6 ; FAST-NEXT: orr [[REG4:w[0-9]+]], wzr, #0x8 ; FAST-NEXT: movz [[REG5:x[0-9]+]], #0xa -; FAST-NEXT: str [[REG1]], [sp, #-64]! +; FAST-NEXT: str [[REG1]], [sp] ; FAST-NEXT: str [[REG2]], [sp, #16] ; FAST-NEXT: str [[REG3]], [sp, #24] ; FAST-NEXT: str [[REG4]], [sp, #36] Index: llvm/trunk/test/CodeGen/AArch64/arm64-patchpoint.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/arm64-patchpoint.ll +++ llvm/trunk/test/CodeGen/AArch64/arm64-patchpoint.ll @@ -26,10 +26,11 @@ ; as a leaf function. ; ; CHECK-LABEL: caller_meta_leaf -; CHECK: mov x29, sp -; CHECK-NEXT: sub sp, sp, #32 +; CHECK: sub sp, sp, #48 +; CHECK-NEXT: stp x29, x30, [sp, #32] +; CHECK-NEXT: add x29, sp, #32 ; CHECK: Ltmp -; CHECK: add sp, sp, #32 +; CHECK: add sp, sp, #48 ; CHECK: ret define void @caller_meta_leaf() { Index: llvm/trunk/test/CodeGen/AArch64/arm64-shrink-wrapping.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ llvm/trunk/test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -13,9 +13,9 @@ ; ENABLE-NEXT: b.ge [[EXIT_LABEL:LBB[0-9_]+]] ; ; Prologue code. -; CHECK: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #-16]! -; CHECK-NEXT: mov [[SAVE_SP]], sp -; CHECK-NEXT: sub sp, sp, #16 +; CHECK: sub sp, sp, #32 +; CHECK-NEXT: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #16] +; CHECK-NEXT: add [[SAVE_SP]], sp, #16 ; ; Compare the arguments and jump to exit. ; After the prologue is set. @@ -33,8 +33,8 @@ ; Without shrink-wrapping, epilogue is in the exit block. ; DISABLE: [[EXIT_LABEL]]: ; Epilogue code. 
-; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldp x{{[0-9]+}}, [[CSR]], [sp], #16 +; CHECK-NEXT: ldp x{{[0-9]+}}, [[CSR]], [sp, #16] +; CHECK-NEXT: add sp, sp, #32 ; ; With shrink-wrapping, exit block is a simple return. ; ENABLE: [[EXIT_LABEL]]: @@ -454,9 +454,9 @@ ; ENABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]] ; ; Prologue code. -; CHECK: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #-16]! -; CHECK-NEXT: mov [[NEW_SP:x[0-9]+]], sp -; CHECK-NEXT: sub sp, sp, #48 +; CHECK: sub sp, sp, #64 +; CHECK-NEXT: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #48] +; CHECK-NEXT: add [[NEW_SP:x[0-9]+]], sp, #48 ; ; DISABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]] ; Setup of the varags. @@ -473,8 +473,8 @@ ; DISABLE: [[IFEND_LABEL]]: ; %if.end ; ; Epilogue code. -; CHECK: add sp, sp, #48 -; CHECK-NEXT: ldp [[CSR1]], [[CSR2]], [sp], #16 +; CHECK: ldp [[CSR1]], [[CSR2]], [sp, #48] +; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret ; ; ENABLE: [[ELSE_LABEL]]: ; %if.else Index: llvm/trunk/test/CodeGen/AArch64/fastcc.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/fastcc.ll +++ llvm/trunk/test/CodeGen/AArch64/fastcc.ll @@ -7,13 +7,15 @@ define fastcc void @func_stack0() { ; CHECK-LABEL: func_stack0: -; CHECK: mov x29, sp -; CHECK: str w{{[0-9]+}}, [sp, #-32]! +; CHECK: sub sp, sp, #48 +; CHECK: add x29, sp, #32 +; CHECK: str w{{[0-9]+}}, [sp] ; CHECK-TAIL-LABEL: func_stack0: -; CHECK-TAIL: stp x29, x30, [sp, #-16]! -; CHECK-TAIL-NEXT: mov x29, sp -; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]! 
+; CHECK-TAIL: sub sp, sp, #48 +; CHECK-TAIL-NEXT: stp x29, x30, [sp, #32] +; CHECK-TAIL-NEXT: add x29, sp, #32 +; CHECK-TAIL: str w{{[0-9]+}}, [sp] call fastcc void @func_stack8([8 x i32] undef, i32 42) @@ -42,27 +44,29 @@ ; CHECK-TAIL-NOT: sub sp, sp ret void -; CHECK: add sp, sp, #32 -; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK: ldp x29, x30, [sp, #32] +; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret -; CHECK-TAIL: add sp, sp, #32 -; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16 +; CHECK-TAIL: ldp x29, x30, [sp, #32] +; CHECK-TAIL-NEXT: add sp, sp, #48 ; CHECK-TAIL-NEXT: ret } define fastcc void @func_stack8([8 x i32], i32 %stacked) { ; CHECK-LABEL: func_stack8: -; CHECK: stp x29, x30, [sp, #-16]! -; CHECK: mov x29, sp -; CHECK: str w{{[0-9]+}}, [sp, #-32]! +; CHECK: sub sp, sp, #48 +; CHECK: stp x29, x30, [sp, #32] +; CHECK: add x29, sp, #32 +; CHECK: str w{{[0-9]+}}, [sp] ; CHECK-TAIL-LABEL: func_stack8: -; CHECK-TAIL: stp x29, x30, [sp, #-16]! -; CHECK-TAIL: mov x29, sp -; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]! 
+; CHECK-TAIL: sub sp, sp, #48 +; CHECK-TAIL: stp x29, x30, [sp, #32] +; CHECK-TAIL: add x29, sp, #32 +; CHECK-TAIL: str w{{[0-9]+}}, [sp] call fastcc void @func_stack8([8 x i32] undef, i32 42) @@ -91,23 +95,22 @@ ; CHECK-TAIL-NOT: sub sp, sp ret void -; CHECK: add sp, sp, #32 -; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ldp x29, x30, [sp, #32] +; CHECK: add sp, sp, #48 ; CHECK-NEXT: ret -; CHECK-TAIL: add sp, sp, #32 -; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16 -; CHECK-TAIL-NEXT: add sp, sp, #16 +; CHECK-TAIL: ldp x29, x30, [sp, #32] +; CHECK-TAIL-NEXT: add sp, sp, #64 ; CHECK-TAIL-NEXT: ret } define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) { ; CHECK-LABEL: func_stack32: -; CHECK: mov x29, sp +; CHECK: add x29, sp, #32 ; CHECK-TAIL-LABEL: func_stack32: -; CHECK-TAIL: mov x29, sp +; CHECK-TAIL: add x29, sp, #32 call fastcc void @func_stack8([8 x i32] undef, i32 42) @@ -136,13 +139,12 @@ ; CHECK-TAIL-NOT: sub sp, sp ret void -; CHECK: add sp, sp, #32 -; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK: ldp x29, x30, [sp, #32] +; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret -; CHECK-TAIL: add sp, sp, #32 -; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16 -; CHECK-TAIL-NEXT: add sp, sp, #32 +; CHECK-TAIL: ldp x29, x30, [sp, #32] +; CHECK-TAIL-NEXT: add sp, sp, #80 ; CHECK-TAIL-NEXT: ret } @@ -180,22 +182,21 @@ ; Check that arg stack pop is done after callee-save restore when no frame pointer is used. define fastcc void @func_stack32_leaf_local([8 x i32], i128 %stacked0, i128 %stacked1) { ; CHECK-LABEL: func_stack32_leaf_local: -; CHECK: str x20, [sp, #-16]! -; CHECK-NEXT: sub sp, sp, #16 +; CHECK: sub sp, sp, #32 +; CHECK-NEXT: str x20, [sp, #16] ; CHECK: nop ; CHECK-NEXT: //NO_APP -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: ldr x20, [sp], #16 +; CHECK-NEXT: ldr x20, [sp, #16] +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret ; CHECK-TAIL-LABEL: func_stack32_leaf_local: -; CHECK-TAIL: str x20, [sp, #-16]! 
-; CHECK-TAIL-NEXT: sub sp, sp, #16 +; CHECK-TAIL: sub sp, sp, #32 +; CHECK-TAIL-NEXT: str x20, [sp, #16] ; CHECK-TAIL: nop ; CHECK-TAIL-NEXT: //NO_APP -; CHECK-TAIL-NEXT: add sp, sp, #16 -; CHECK-TAIL-NEXT: ldr x20, [sp], #16 -; CHECK-TAIL-NEXT: add sp, sp, #32 +; CHECK-TAIL-NEXT: ldr x20, [sp, #16] +; CHECK-TAIL-NEXT: add sp, sp, #64 ; CHECK-TAIL-NEXT: ret ; CHECK-TAIL-RZ-LABEL: func_stack32_leaf_local: Index: llvm/trunk/test/CodeGen/AArch64/func-calls.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/func-calls.ll +++ llvm/trunk/test/CodeGen/AArch64/func-calls.ll @@ -89,11 +89,11 @@ ; that varstruct is passed on the stack. Rather dependent on how a ; memcpy gets created, but the following works for now. -; CHECK-DAG: str {{q[0-9]+}}, [sp, #-16] +; CHECK-DAG: str {{q[0-9]+}}, [sp] ; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 ; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b -; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp, #-16]! +; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp] ; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0 ; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]] Index: llvm/trunk/test/CodeGen/AArch64/tailcall-implicit-sret.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/tailcall-implicit-sret.ll +++ llvm/trunk/test/CodeGen/AArch64/tailcall-implicit-sret.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s +; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -disable-post-ra -asm-verbose=false | FileCheck %s ; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks. 
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" Index: llvm/trunk/test/DebugInfo/AArch64/prologue_end.ll =================================================================== --- llvm/trunk/test/DebugInfo/AArch64/prologue_end.ll +++ llvm/trunk/test/DebugInfo/AArch64/prologue_end.ll @@ -9,9 +9,9 @@ define void @prologue_end_test() nounwind uwtable !dbg !4 { ; CHECK: prologue_end_test: ; CHECK: .cfi_startproc - ; CHECK: stp x29, x30 - ; CHECK: mov x29, sp ; CHECK: sub sp, sp + ; CHECK: stp x29, x30 + ; CHECK: add x29, sp ; CHECK: .loc 1 3 3 prologue_end ; CHECK: bl _func ; CHECK: bl _func