Index: lib/Target/AArch64/AArch64FrameLowering.h
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.h
+++ lib/Target/AArch64/AArch64FrameLowering.h
@@ -66,6 +66,10 @@
   bool enableShrinkWrapping(const MachineFunction &MF) const override {
     return true;
   }
+
+private:
+  bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
+                                      unsigned StackBumpBytes) const;
 };
 } // End llvm namespace
Index: lib/Target/AArch64/AArch64FrameLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.cpp
+++ lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -283,6 +283,142 @@
   return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
 }
 
+bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
+    MachineFunction &MF, unsigned StackBumpBytes) const {
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  if (AFI->getLocalStackSize() == 0)
+    return false;
+
+  // 512 is the maximum immediate for stp/ldp that will be used for
+  // callee-save save/restores.
+  if (StackBumpBytes >= 512)
+    return false;
+
+  if (MFI->hasVarSizedObjects())
+    return false;
+
+  if (RegInfo->needsStackRealignment(MF))
+    return false;
+
+  // This isn't strictly necessary, but it simplifies things a bit since the
+  // current RedZone handling code assumes the SP is adjusted by the
+  // callee-save save/restore code.
+  if (canUseRedZone(MF))
+    return false;
+
+  return true;
+}
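[Annotation, not part of the patch: the 512-byte cutoff above tracks the STP/LDP addressing mode, which encodes a signed 7-bit immediate scaled by 8, i.e. byte offsets in [-512, 504]. A minimal standalone sketch of that bound follows; isEncodableStpOffset is a hypothetical helper written for illustration only.]

#include <cassert>

// Byte offsets a 64-bit STP/LDP register-pair access can encode (imm7 * 8).
static bool isEncodableStpOffset(int Bytes) {
  return Bytes % 8 == 0 && Bytes >= -512 && Bytes <= 504;
}

int main() {
  // Largest 16-byte-aligned bump the heuristic accepts (StackBumpBytes < 512).
  int StackBumpBytes = 496;
  // After the offset fixup, the highest callee-save pair sits just below the
  // top of the combined allocation and must still be encodable.
  assert(isEncodableStpOffset(StackBumpBytes - 16));
  assert(!isEncodableStpOffset(512)); // first byte offset imm7 cannot encode
  return 0;
}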
+
+// Fixup callee-save register save/restore instructions to take into account
+// combined SP bump. This means two things: 1) the local stack size needs to
+// be added to the stack offsets and 2) the pre-decrement and post-increment of
+// the SP by the first/last store/load need to be removed.
+static MachineBasicBlock::iterator
+fixupCalleeSaveRestoreStackOffsetAndIncDec(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MBBI) {
+  MachineFunction &MF = *MBB.getParent();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  unsigned LocalStackSize = AFI->getLocalStackSize();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  unsigned NewOpc;
+  bool IsUnscaled = false;
+  bool IsPrePostIncDec = false;
+  bool IsDec = false;
+  switch (MBBI->getOpcode()) {
+  // For pre/post inc/dec, convert to non inc/dec and update the stack offset.
+  case AArch64::STPXpre:
+    NewOpc = AArch64::STPXi;
+    IsPrePostIncDec = true;
+    IsDec = true;
+    break;
+  case AArch64::STPDpre:
+    NewOpc = AArch64::STPDi;
+    IsPrePostIncDec = true;
+    IsDec = true;
+    break;
+  case AArch64::STRXpre:
+    NewOpc = AArch64::STRXui;
+    IsPrePostIncDec = true;
+    IsDec = true;
+    IsUnscaled = true;
+    break;
+  case AArch64::STRDpre:
+    NewOpc = AArch64::STRDui;
+    IsPrePostIncDec = true;
+    IsDec = true;
+    IsUnscaled = true;
+    break;
+  case AArch64::LDPXpost:
+    NewOpc = AArch64::LDPXi;
+    IsPrePostIncDec = true;
+    break;
+  case AArch64::LDPDpost:
+    NewOpc = AArch64::LDPDi;
+    IsPrePostIncDec = true;
+    break;
+  case AArch64::LDRXpost:
+    NewOpc = AArch64::LDRXui;
+    IsPrePostIncDec = true;
+    IsUnscaled = true;
+    break;
+  case AArch64::LDRDpost:
+    NewOpc = AArch64::LDRDui;
+    IsPrePostIncDec = true;
+    IsUnscaled = true;
+    break;
+  // For non pre/post inc/dec, just update the stack offset.
+  case AArch64::STPXi:
+  case AArch64::STPDi:
+  case AArch64::STRXui:
+  case AArch64::STRDui:
+  case AArch64::LDPXi:
+  case AArch64::LDRXui:
+  case AArch64::LDPDi:
+  case AArch64::LDRDui:
+    NewOpc = MBBI->getOpcode();
+    break;
+
+  default:
+    llvm_unreachable("Unexpected FrameSetup opcode!");
+  }
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
+  unsigned OpndIdx = 0;
+  // Skip over the SP def when converting to non pre/post inc.
+  if (IsPrePostIncDec) {
+    OpndIdx = 1;
+    assert(MBBI->getOperand(0).getReg() == AArch64::SP);
+  }
+  for (; OpndIdx < MBBI->getNumOperands() - 1; ++OpndIdx)
+    MIB.addOperand(MBBI->getOperand(OpndIdx));
+
+  int64_t OldStackImm = MBBI->getOperand(OpndIdx).getImm();
+  if (IsPrePostIncDec) {
+    int64_t OldStackOffset = OldStackImm;
+    if (IsDec)
+      OldStackOffset *= -1;
+    if (!IsUnscaled)
+      OldStackOffset *= 8;
+    assert(OldStackOffset == AFI->getCalleeSavedStackSize());
+    OldStackImm = 0;
+  }
+  assert(LocalStackSize % 8 == 0);
+  // All generated opcodes have scaled offsets.
+  MIB.addImm(OldStackImm + LocalStackSize / 8);
+
+  MIB.setMIFlags(MBBI->getFlags());
+  MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end());
+
+  return std::prev(MBB.erase(MBBI));
+}
+
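[Annotation, not part of the patch: a minimal standalone distillation of the immediate arithmetic in the fixup above. fixupImm is a hypothetical name; the pre/post-increment form carries the callee-save area size in its immediate, which is dropped, and the local stack size, in 8-byte units, is added to the resulting scaled offset.]

#include <cassert>
#include <cstdint>

// Returns the new scaled immediate for a converted callee-save save/restore.
static int64_t fixupImm(int64_t OldImm, bool IsPrePostIncDec, bool IsDec,
                        bool IsUnscaled, int64_t CalleeSavedStackSize,
                        int64_t LocalStackSize) {
  if (IsPrePostIncDec) {
    int64_t OldOffset = OldImm;
    if (IsDec)
      OldOffset *= -1; // pre-decrement stores carry a negative immediate
    if (!IsUnscaled)
      OldOffset *= 8;  // STP/LDP immediates are scaled by 8
    assert(OldOffset == CalleeSavedStackSize);
    OldImm = 0;        // the SP adjustment moves into the single combined bump
  }
  assert(LocalStackSize % 8 == 0);
  return OldImm + LocalStackSize / 8; // all converted opcodes use scaled imms
}

int main() {
  // stp x19, x30, [sp, #-16]!  becomes  stp x19, x30, [sp, #16]
  // given a 16-byte callee-save area and 16 bytes of locals.
  assert(fixupImm(-2, /*PrePostIncDec=*/true, /*Dec=*/true, /*Unscaled=*/false,
                  /*CSSize=*/16, /*LocalSize=*/16) == 2); // 2 * 8 == 16 bytes
  return 0;
}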
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -334,18 +470,34 @@
     return;
   }
 
-  NumBytes -= AFI->getCalleeSavedStackSize();
-  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+  auto CSStackSize = AFI->getCalleeSavedStackSize();
   // All of the remaining stack allocations are for locals.
-  AFI->setLocalStackSize(NumBytes);
+  AFI->setLocalStackSize(NumBytes - CSStackSize);
 
-  // Move past the saves of the callee-saved registers.
-  MachineBasicBlock::iterator End = MBB.end();
-  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup))
+  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+  if (CombineSPBump) {
+    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+                    MachineInstr::FrameSetup);
+    NumBytes = 0;
+  } else {
+    NumBytes -= CSStackSize;
+  }
+  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+  // Move past the saves of the callee-saved registers, fixing up the offsets
+  // and pre-inc if we decided to combine the callee-save and local stack
+  // pointer bump above.
+  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) {
+    if (CombineSPBump)
+      MBBI = fixupCalleeSaveRestoreStackOffsetAndIncDec(MBB, MBBI);
     ++MBBI;
+  }
+
   if (HasFP) {
     // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
     int FPOffset = AFI->getCalleeSavedStackSize() - 16;
+    if (CombineSPBump)
+      FPOffset += AFI->getLocalStackSize();
 
     // Issue    sub fp, sp, FPOffset or
     //          mov fp,sp          when FPOffset is zero.
@@ -570,15 +722,25 @@
   // it as the 2nd argument of AArch64ISD::TC_RETURN.
 
   // Move past the restores of the callee-saved registers.
+  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
   MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
-  MachineBasicBlock::iterator Begin = MBB.begin();
-  while (LastPopI != Begin) {
+  while (LastPopI != MBB.begin()) {
     --LastPopI;
     if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) {
       ++LastPopI;
       break;
-    }
+    } else if (CombineSPBump)
+      LastPopI = fixupCalleeSaveRestoreStackOffsetAndIncDec(MBB, LastPopI);
   }
+
+  // If there is a single SP update, insert it before the ret and we're done.
+  if (CombineSPBump) {
+    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+                    NumBytes + ArgumentPopSize, TII,
+                    MachineInstr::FrameDestroy);
+    return;
+  }
+
   NumBytes -= AFI->getCalleeSavedStackSize();
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
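[Annotation, not part of the patch: the FPOffset adjustment in emitPrologue above keeps the frame pointer at entry-SP minus 16 whether or not the bump is combined. A small standalone check, using the 32-byte callee-save / 16-byte locals split that several of the tests below exercise; the entry-SP-relative model is illustrative only.]

#include <cassert>

int main() {
  const int CSStackSize = 32;    // callee-save area (e.g. x29, x30, x19, x20)
  const int LocalStackSize = 16; // locals

  // Separate bumps: sub sp, #CS; add fp, sp, #(CS - 16); sub sp, #locals.
  int FpSeparate = -CSStackSize + (CSStackSize - 16);

  // Combined bump: sub sp, #(CS + locals); add fp, sp, #(CS - 16 + locals).
  int FpCombined =
      -(CSStackSize + LocalStackSize) + (CSStackSize - 16 + LocalStackSize);

  assert(FpSeparate == -16 && FpCombined == -16); // both: entry SP - 16
  return 0;
}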
Index: test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
===================================================================
--- test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -98,8 +98,8 @@
 ; CHECK-LABEL: novla_nodynamicrealign_call
 ; CHECK: .cfi_startproc
 ;   Check that used callee-saved registers are saved
-; CHECK: stp x19, x30, [sp, #-16]!
-; CHECK: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK: stp x19, x30, [sp, #16]
 ;   Check correctness of cfi pseudo-instructions
 ; CHECK: .cfi_def_cfa_offset 32
 ; CHECK: .cfi_offset w30, -8
@@ -110,17 +110,18 @@
 ;   Check correct access to local variable on the stack, through stack pointer
 ; CHECK: ldr w[[ILOC:[0-9]+]], [sp, #12]
 ;   Check epilogue:
-; CHECK: ldp x19, x30, [sp], #16
+; CHECK: ldp x19, x30, [sp, #16]
 ; CHECK: ret
 ; CHECK: .cfi_endproc
 
 ; CHECK-MACHO-LABEL: _novla_nodynamicrealign_call:
 ; CHECK-MACHO: .cfi_startproc
 ;   Check that used callee-saved registers are saved
-; CHECK-MACHO: stp x20, x19, [sp, #-32]!
+; CHECK-MACHO: sub sp, sp, #48
+; CHECK-MACHO: stp x20, x19, [sp, #16]
 ;   Check that the frame pointer is created:
-; CHECK-MACHO: stp x29, x30, [sp, #16]
-; CHECK-MACHO: add x29, sp, #16
+; CHECK-MACHO: stp x29, x30, [sp, #32]
+; CHECK-MACHO: add x29, sp, #32
 ;   Check correctness of cfi pseudo-instructions
 ; CHECK-MACHO: .cfi_def_cfa w29, 16
 ; CHECK-MACHO: .cfi_offset w30, -8
@@ -133,8 +134,8 @@
 ;   Check correct access to local variable on the stack, through stack pointer
 ; CHECK-MACHO: ldr w[[ILOC:[0-9]+]], [sp, #12]
 ;   Check epilogue:
-; CHECK-MACHO: ldp x29, x30, [sp, #16]
-; CHECK-MACHO: ldp x20, x19, [sp], #32
+; CHECK-MACHO: ldp x29, x30, [sp, #32]
+; CHECK-MACHO: ldp x20, x19, [sp, #16]
 ; CHECK-MACHO: ret
 ; CHECK-MACHO: .cfi_endproc
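[Annotation, not part of the patch: the .cfi_* lines above are unchanged by design, since the save slots land at the same CFA-relative addresses either way. A quick standalone check of that arithmetic; the slot addresses follow from the stp above, and the w19 offset of -16 is an assumption, as the test excerpt cuts off after w30.]

#include <cassert>

int main() {
  const int CfaOffset = 32; // .cfi_def_cfa_offset 32: CFA = SP + 32
  const int X19Slot = 16;   // stp x19, x30, [sp, #16] puts x19 at sp+16
  const int X30Slot = 24;   // ...and x30 at sp+24
  assert(X30Slot - CfaOffset == -8);  // .cfi_offset w30, -8
  assert(X19Slot - CfaOffset == -16); // .cfi_offset w19, -16 (assumed)
  return 0;
}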
Index: test/CodeGen/AArch64/arm64-aapcs-be.ll
===================================================================
--- test/CodeGen/AArch64/arm64-aapcs-be.ll
+++ test/CodeGen/AArch64/arm64-aapcs-be.ll
@@ -32,7 +32,8 @@
 
 define void @test_block_addr_callee() {
 ; CHECK-LABEL: test_block_addr_callee:
-; CHECK: str {{[a-z0-9]+}}, [sp, #-16]!
+; CHECK: sub sp, sp, #32
+; CHECK: str {{[a-z0-9]+}}, [sp, #16]
 ; CHECK: bl test_block_addr
   %val = insertvalue [1 x float] undef, float 0.0, 0
   call float @test_block_addr([8 x float] undef, [1 x float] %val)
Index: test/CodeGen/AArch64/arm64-abi.ll
===================================================================
--- test/CodeGen/AArch64/arm64-abi.ll
+++ test/CodeGen/AArch64/arm64-abi.ll
@@ -130,7 +130,7 @@
 ; CHECK-LABEL: test3
 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
 ; FAST-LABEL: test3
-; FAST: sub sp, sp, #32
+; FAST: sub sp, sp, #48
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
   %0 = load <2 x i32>, <2 x i32>* %in, align 8
Index: test/CodeGen/AArch64/arm64-abi_align.ll
===================================================================
--- test/CodeGen/AArch64/arm64-abi_align.ll
+++ test/CodeGen/AArch64/arm64-abi_align.ll
@@ -291,7 +291,7 @@
 ; Space for s2 is allocated at sp
 
 ; FAST-LABEL: caller42
-; FAST: sub sp, sp, #96
+; FAST: sub sp, sp, #112
 ; Space for s1 is allocated at fp-24 = sp+72
 ; Space for s2 is allocated at sp+48
 ; FAST: sub x[[A:[0-9]+]], x29, #24
@@ -317,8 +317,8 @@
 define i32 @caller42_stack() #3 {
 entry:
 ; CHECK-LABEL: caller42_stack
-; CHECK: mov x29, sp
-; CHECK: sub sp, sp, #96
+; CHECK: sub sp, sp, #112
+; CHECK: add x29, sp, #96
 ; CHECK: stur {{x[0-9]+}}, [x29, #-16]
 ; CHECK: stur {{q[0-9]+}}, [x29, #-32]
 ; CHECK: str {{x[0-9]+}}, [sp, #48]
@@ -399,7 +399,7 @@
 ; Space for s2 is allocated at sp
 
 ; FAST-LABEL: caller43
-; FAST: mov x29, sp
+; FAST: add x29, sp, #64
 ; Space for s1 is allocated at sp+32
 ; Space for s2 is allocated at sp
 ; FAST: add x1, sp, #32
@@ -429,8 +429,8 @@
 define i32 @caller43_stack() #3 {
 entry:
 ; CHECK-LABEL: caller43_stack
-; CHECK: mov x29, sp
-; CHECK: sub sp, sp, #96
+; CHECK: sub sp, sp, #112
+; CHECK: add x29, sp, #96
 ; CHECK: stur {{q[0-9]+}}, [x29, #-16]
 ; CHECK: stur {{q[0-9]+}}, [x29, #-32]
 ; CHECK: str {{q[0-9]+}}, [sp, #48]
@@ -446,7 +446,7 @@
 ; CHECK: str w[[C]], [sp]
 
 ; FAST-LABEL: caller43_stack
-; FAST: sub sp, sp, #96
+; FAST: sub sp, sp, #112
 ; Space for s1 is allocated at fp-32 = sp+64
 ; Space for s2 is allocated at sp+32
 ; FAST: sub x[[A:[0-9]+]], x29, #32
@@ -508,7 +508,7 @@
 ; "i64 %0" should be in register x7.
 ; "i32 8" should be on stack at [sp].
 ; CHECK: ldr x7, [{{x[0-9]+}}]
-; CHECK: str {{w[0-9]+}}, [sp, #-16]!
+; CHECK: str {{w[0-9]+}}, [sp]
 ; FAST-LABEL: i64_split
 ; FAST: ldr x7, [{{x[0-9]+}}]
 ; FAST: mov x[[R0:[0-9]+]], sp
Index: test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
===================================================================
--- test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
+++ test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -14,7 +14,7 @@
 define void @main() nounwind {
 entry:
 ; CHECK: main
-; CHECK: mov x29, sp
+; CHECK: add x29, sp, #16
 ; CHECK: mov [[REG:x[0-9]+]], sp
 ; CHECK-NEXT: add x0, [[REG]], #8
   %E = alloca %struct.S2Ty, align 4
Index: test/CodeGen/AArch64/arm64-hello.ll
===================================================================
--- test/CodeGen/AArch64/arm64-hello.ll
+++ test/CodeGen/AArch64/arm64-hello.ll
@@ -2,26 +2,26 @@
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -disable-post-ra | FileCheck %s --check-prefix=CHECK-LINUX
 
 ; CHECK-LABEL: main:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK-NEXT: stp x29, x30, [sp, #16]
+; CHECK-NEXT: add x29, sp, #16
 ; CHECK-NEXT: stur wzr, [x29, #-4]
 ; CHECK: adrp x0, L_.str@PAGE
 ; CHECK: add x0, x0, L_.str@PAGEOFF
 ; CHECK-NEXT: bl _puts
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ldp x29, x30, [sp, #16]
+; CHECK-NEXT: add sp, sp, #32
 ; CHECK-NEXT: ret
 
 ; CHECK-LINUX-LABEL: main:
-; CHECK-LINUX: str x30, [sp, #-16]!
-; CHECK-LINUX-NEXT: sub sp, sp, #16
+; CHECK-LINUX: sub sp, sp, #32
+; CHECK-LINUX-NEXT: str x30, [sp, #16]
 ; CHECK-LINUX-NEXT: str wzr, [sp, #12]
 ; CHECK-LINUX: adrp x0, .L.str
 ; CHECK-LINUX: add x0, x0, :lo12:.L.str
 ; CHECK-LINUX-NEXT: bl puts
-; CHECK-LINUX-NEXT: add sp, sp, #16
-; CHECK-LINUX-NEXT: ldr x30, [sp], #16
+; CHECK-LINUX-NEXT: ldr x30, [sp, #16]
+; CHECK-LINUX-NEXT: add sp, sp, #32
 ; CHECK-LINUX-NEXT: ret
 
 @.str = private unnamed_addr constant [7 x i8] c"hello\0A\00"
Index: test/CodeGen/AArch64/arm64-join-reserved.ll
===================================================================
--- test/CodeGen/AArch64/arm64-join-reserved.ll
+++ test/CodeGen/AArch64/arm64-join-reserved.ll
@@ -5,7 +5,7 @@
 ; A move isn't necessary.
 ;
 ; CHECK-LABEL: g:
-; CHECK: str xzr, [sp, #-16]!
+; CHECK: str xzr, [sp]
 ; CHECK: bl
 ; CHECK: ret
 define void @g() nounwind ssp {
Index: test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
===================================================================
--- test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
+++ test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
@@ -7,7 +7,7 @@
 entry:
 ; CHECK-LABEL: jscall_patchpoint_codegen:
 ; CHECK: Ltmp
-; CHECK: str x{{.+}}, [sp, #-16]!
+; CHECK: str x{{.+}}, [sp]
 ; CHECK-NEXT: mov x0, x{{.+}}
 ; CHECK: Ltmp
 ; CHECK-NEXT: movz x16, #0xffff, lsl #32
@@ -16,7 +16,7 @@
 ; CHECK-NEXT: blr x16
 ; FAST-LABEL: jscall_patchpoint_codegen:
 ; FAST: Ltmp
-; FAST: str x{{.+}}, [sp, #-16]!
+; FAST: str x{{.+}}, [sp]
 ; FAST: Ltmp
 ; FAST-NEXT: movz x16, #0xffff, lsl #32
 ; FAST-NEXT: movk x16, #0xdead, lsl #16
@@ -50,7 +50,7 @@
 ; FAST: orr [[REG1:x[0-9]+]], xzr, #0x2
 ; FAST-NEXT: orr [[REG2:w[0-9]+]], wzr, #0x4
 ; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6
-; FAST-NEXT: str [[REG1]], [sp, #-32]!
+; FAST-NEXT: str [[REG1]], [sp]
 ; FAST-NEXT: str [[REG2]], [sp, #16]
 ; FAST-NEXT: str [[REG3]], [sp, #24]
 ; FAST: Ltmp
@@ -90,7 +90,7 @@
 ; FAST-NEXT: orr [[REG3:x[0-9]+]], xzr, #0x6
 ; FAST-NEXT: orr [[REG4:w[0-9]+]], wzr, #0x8
 ; FAST-NEXT: movz [[REG5:x[0-9]+]], #0xa
-; FAST-NEXT: str [[REG1]], [sp, #-64]!
+; FAST-NEXT: str [[REG1]], [sp]
 ; FAST-NEXT: str [[REG2]], [sp, #16]
 ; FAST-NEXT: str [[REG3]], [sp, #24]
 ; FAST-NEXT: str [[REG4]], [sp, #36]
Index: test/CodeGen/AArch64/arm64-patchpoint.ll
===================================================================
--- test/CodeGen/AArch64/arm64-patchpoint.ll
+++ test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -26,10 +26,11 @@
 ; as a leaf function.
 ;
 ; CHECK-LABEL: caller_meta_leaf
-; CHECK: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #32
+; CHECK: sub sp, sp, #48
+; CHECK-NEXT: stp x29, x30, [sp, #32]
+; CHECK-NEXT: add x29, sp, #32
 ; CHECK: Ltmp
-; CHECK: add sp, sp, #32
+; CHECK: add sp, sp, #48
 ; CHECK: ret
 
 define void @caller_meta_leaf() {
Index: test/CodeGen/AArch64/arm64-shrink-wrapping.ll
===================================================================
--- test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -13,9 +13,9 @@
 ; ENABLE-NEXT: b.ge [[EXIT_LABEL:LBB[0-9_]+]]
 ;
 ; Prologue code.
-; CHECK: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #-16]!
-; CHECK-NEXT: mov [[SAVE_SP]], sp
-; CHECK-NEXT: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK-NEXT: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #16]
+; CHECK-NEXT: add [[SAVE_SP]], sp, #16
 ;
 ; Compare the arguments and jump to exit.
 ; After the prologue is set.
@@ -33,8 +33,8 @@
 ; Without shrink-wrapping, epilogue is in the exit block.
 ; DISABLE: [[EXIT_LABEL]]:
 ; Epilogue code.
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x{{[0-9]+}}, [[CSR]], [sp], #16
+; CHECK-NEXT: ldp x{{[0-9]+}}, [[CSR]], [sp, #16]
+; CHECK-NEXT: add sp, sp, #32
 ;
 ; With shrink-wrapping, exit block is a simple return.
 ; ENABLE: [[EXIT_LABEL]]:
@@ -454,9 +454,9 @@
 ; ENABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
 ;
 ; Prologue code.
-; CHECK: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #-16]!
-; CHECK-NEXT: mov [[NEW_SP:x[0-9]+]], sp
-; CHECK-NEXT: sub sp, sp, #48
+; CHECK: sub sp, sp, #64
+; CHECK-NEXT: stp [[CSR1:x[0-9]+]], [[CSR2:x[0-9]+]], [sp, #48]
+; CHECK-NEXT: add [[NEW_SP:x[0-9]+]], sp, #48
 ;
 ; DISABLE: cbz w0, [[ELSE_LABEL:LBB[0-9_]+]]
 ; Setup of the varags.
@@ -473,8 +473,8 @@
 ; DISABLE: [[IFEND_LABEL]]: ; %if.end
 ;
 ; Epilogue code.
-; CHECK: add sp, sp, #48
-; CHECK-NEXT: ldp [[CSR1]], [[CSR2]], [sp], #16
+; CHECK: ldp [[CSR1]], [[CSR2]], [sp, #48]
+; CHECK-NEXT: add sp, sp, #64
 ; CHECK-NEXT: ret
 ;
 ; ENABLE: [[ELSE_LABEL]]: ; %if.else
Index: test/CodeGen/AArch64/fastcc.ll
===================================================================
--- test/CodeGen/AArch64/fastcc.ll
+++ test/CodeGen/AArch64/fastcc.ll
@@ -7,13 +7,15 @@
 
 define fastcc void @func_stack0() {
 ; CHECK-LABEL: func_stack0:
-; CHECK: mov x29, sp
-; CHECK: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK: sub sp, sp, #48
+; CHECK: add x29, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp]
 
 ; CHECK-TAIL-LABEL: func_stack0:
-; CHECK-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-TAIL-NEXT: mov x29, sp
-; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL-NEXT: stp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add x29, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp]
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -42,27 +44,29 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK: ldp x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-TAIL: ldp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add sp, sp, #48
 ; CHECK-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack8([8 x i32], i32 %stacked) {
 ; CHECK-LABEL: func_stack8:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK: mov x29, sp
-; CHECK: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK: sub sp, sp, #48
+; CHECK: stp x29, x30, [sp, #32]
+; CHECK: add x29, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp]
 
 ; CHECK-TAIL-LABEL: func_stack8:
-; CHECK-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-TAIL: mov x29, sp
-; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
+; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL: stp x29, x30, [sp, #32]
+; CHECK-TAIL: add x29, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp]
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -91,23 +95,22 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ldp x29, x30, [sp, #32]
+; CHECK: add sp, sp, #48
 ; CHECK-NEXT: ret
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
-; CHECK-TAIL-NEXT: add sp, sp, #16
+; CHECK-TAIL: ldp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add sp, sp, #64
 ; CHECK-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
 ; CHECK-LABEL: func_stack32:
-; CHECK: mov x29, sp
+; CHECK: add x29, sp, #32
 
 ; CHECK-TAIL-LABEL: func_stack32:
-; CHECK-TAIL: mov x29, sp
+; CHECK-TAIL: add x29, sp, #32
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -136,13 +139,12 @@
 ; CHECK-TAIL-NOT: sub sp, sp
 
   ret void
-; CHECK: add sp, sp, #32
-; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK: ldp x29, x30, [sp, #32]
+; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
 
-; CHECK-TAIL: add sp, sp, #32
-; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
-; CHECK-TAIL-NEXT: add sp, sp, #32
+; CHECK-TAIL: ldp x29, x30, [sp, #32]
+; CHECK-TAIL-NEXT: add sp, sp, #80
 ; CHECK-TAIL-NEXT: ret
 }
@@ -180,22 +182,21 @@
 
 ; Check that arg stack pop is done after callee-save restore when no frame pointer is used.
 define fastcc void @func_stack32_leaf_local([8 x i32], i128 %stacked0, i128 %stacked1) {
 ; CHECK-LABEL: func_stack32_leaf_local:
-; CHECK: str x20, [sp, #-16]!
-; CHECK-NEXT: sub sp, sp, #16
+; CHECK: sub sp, sp, #32
+; CHECK-NEXT: str x20, [sp, #16]
 ; CHECK: nop
 ; CHECK-NEXT: //NO_APP
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldr x20, [sp], #16
+; CHECK-NEXT: ldr x20, [sp, #16]
+; CHECK-NEXT: add sp, sp, #32
 ; CHECK-NEXT: ret
 
 ; CHECK-TAIL-LABEL: func_stack32_leaf_local:
-; CHECK-TAIL: str x20, [sp, #-16]!
-; CHECK-TAIL-NEXT: sub sp, sp, #16
+; CHECK-TAIL: sub sp, sp, #32
+; CHECK-TAIL-NEXT: str x20, [sp, #16]
 ; CHECK-TAIL: nop
 ; CHECK-TAIL-NEXT: //NO_APP
-; CHECK-TAIL-NEXT: add sp, sp, #16
-; CHECK-TAIL-NEXT: ldr x20, [sp], #16
-; CHECK-TAIL-NEXT: add sp, sp, #32
+; CHECK-TAIL-NEXT: ldr x20, [sp, #16]
+; CHECK-TAIL-NEXT: add sp, sp, #64
 ; CHECK-TAIL-NEXT: ret
 
 ; CHECK-TAIL-RZ-LABEL: func_stack32_leaf_local:
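[Annotation, not part of the patch: in the CHECK-TAIL cases above, the single epilogue SP update also folds in the incoming-argument pop (the NumBytes + ArgumentPopSize in emitEpilogue). A standalone sketch with the func_stack32_leaf_local numbers; the 16/16/32 split is inferred from the test and illustrative only.]

#include <cassert>

int main() {
  const int LocalStackSize = 16;  // locals
  const int CSStackSize = 16;     // x20 save slot, 16-byte aligned
  const int ArgumentPopSize = 32; // two i128 stack arguments (tail-call CC)

  // Old epilogue: add sp, #16; ldr x20, [sp], #16; add sp, #32 (three steps).
  // New epilogue: ldr x20, [sp, #16]; add sp, sp, #64 (one SP update).
  assert(LocalStackSize + CSStackSize + ArgumentPopSize == 64);
  // Without the tail-call argument pop, the combined update is just #32.
  assert(LocalStackSize + CSStackSize == 32);
  return 0;
}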
Index: test/CodeGen/AArch64/func-calls.ll
===================================================================
--- test/CodeGen/AArch64/func-calls.ll
+++ test/CodeGen/AArch64/func-calls.ll
@@ -89,11 +89,11 @@
 ; that varstruct is passed on the stack. Rather dependent on how a
 ; memcpy gets created, but the following works for now.
 
-; CHECK-DAG: str {{q[0-9]+}}, [sp, #-16]
+; CHECK-DAG: str {{q[0-9]+}}, [sp]
 ; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
 ; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b
 
-; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp, #-16]!
+; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp]
 ; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
 ; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]]
Index: test/CodeGen/AArch64/tailcall-implicit-sret.ll
===================================================================
--- test/CodeGen/AArch64/tailcall-implicit-sret.ll
+++ test/CodeGen/AArch64/tailcall-implicit-sret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -disable-post-ra -asm-verbose=false | FileCheck %s
 ; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
Index: test/DebugInfo/AArch64/prologue_end.ll
===================================================================
--- test/DebugInfo/AArch64/prologue_end.ll
+++ test/DebugInfo/AArch64/prologue_end.ll
@@ -9,9 +9,9 @@
 define void @prologue_end_test() nounwind uwtable !dbg !4 {
   ; CHECK: prologue_end_test:
   ; CHECK: .cfi_startproc
-  ; CHECK: stp x29, x30
-  ; CHECK: mov x29, sp
   ; CHECK: sub sp, sp
+  ; CHECK: stp x29, x30
+  ; CHECK: add x29, sp
   ; CHECK: .loc 1 3 3 prologue_end
   ; CHECK: bl _func
   ; CHECK: bl _func