diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
@@ -34,6 +34,9 @@
   void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
                             RegScavenger *RS) const override;
 
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                           RegScavenger *RS) const override;
+
   bool hasReservedCallFrame(const MachineFunction &MF) const override;
   MachineBasicBlock::iterator
   eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -45,7 +48,8 @@
   bool hasFP(const MachineFunction &MF) const override;
   bool hasBP(const MachineFunction &MF) const;
 
-  uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const;
+  uint64_t getFirstSPAdjustAmount(const MachineFunction &MF,
+                                  bool IsPrologue = false) const;
 
 private:
   void determineFrameLayout(MachineFunction &MF) const;
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -118,6 +118,26 @@
   MFI.setStackSize(FrameSize);
 }
 
+void LoongArchFrameLowering::processFunctionBeforeFrameFinalized(
+    MachineFunction &MF, RegScavenger *RS) const {
+  const LoongArchRegisterInfo *RI = STI.getRegisterInfo();
+  const TargetRegisterClass &RC = LoongArch::GPRRegClass;
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // estimateStackSize has been observed to under-estimate the final stack
+  // size, so give ourselves wiggle-room by checking for a stack size
+  // representable in an 11-bit signed field rather than 12 bits.
+  if (isInt<11>(MFI.estimateStackSize(MF)))
+    return;
+
+  // Create an emergency spill slot.
+  int FI =
+      MFI.CreateStackObject(RI->getSpillSize(RC), RI->getSpillAlign(RC), false);
+  RS->addScavengingFrameIndex(FI);
+  LLVM_DEBUG(dbgs() << "Allocated FI(" << FI
+                    << ") as the emergency spill slot.\n");
+}
+
 void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
   MachineFrameInfo &MFI = MF.getFrameInfo();
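For readers less familiar with the llvm::isInt<N> helper used above, here is a standalone sketch (mine, not part of the patch) of the semantics being relied on, and of why checking 11 bits instead of the 12 bits that LoongArch load/store immediates actually offer absorbs a moderate under-estimate:

// Sketch only; llvm::isInt<N> from llvm/Support/MathExtras.h behaves like
// this for N < 64.
#include <cassert>
#include <cstdint>

template <unsigned N> constexpr bool isIntN(int64_t X) {
  return -(INT64_C(1) << (N - 1)) <= X && X < (INT64_C(1) << (N - 1));
}

int main() {
  assert(isIntN<12>(2047) && !isIntN<12>(2048)); // ld/st immediate range
  assert(isIntN<11>(1023) && !isIntN<11>(1024)); // the conservative check
  // Any frame estimated at 1024 bytes or more gets an emergency slot, even
  // though sp-relative offsets up to 2047 would still encode directly.
  return 0;
}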
@@ -125,6 +145,7 @@
   const LoongArchRegisterInfo *RI = STI.getRegisterInfo();
   const LoongArchInstrInfo *TII = STI.getInstrInfo();
   MachineBasicBlock::iterator MBBI = MBB.begin();
+  bool IsLA64 = STI.is64Bit();
 
   Register SPReg = LoongArch::R3;
   Register FPReg = LoongArch::R22;
@@ -144,19 +165,22 @@
   if (StackSize == 0 && !MFI.adjustsStack())
     return;
 
-  uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
+  uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF, true);
+  uint64_t SecondSPAdjustAmount = RealStackSize - FirstSPAdjustAmount;
   // Split the SP adjustment to reduce the offsets of callee saved spill.
   if (FirstSPAdjustAmount)
     StackSize = FirstSPAdjustAmount;
 
   // Adjust stack.
   adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);
-  // Emit ".cfi_def_cfa_offset StackSize".
-  unsigned CFIIndex =
-      MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize));
-  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-      .addCFIIndex(CFIIndex)
-      .setMIFlag(MachineInstr::FrameSetup);
+  if (FirstSPAdjustAmount != 2048 || SecondSPAdjustAmount == 0) {
+    // Emit ".cfi_def_cfa_offset StackSize".
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
 
   const auto &CSI = MFI.getCalleeSavedInfo();
@@ -193,14 +217,25 @@
   }
 
   // Emit the second SP adjustment after saving callee saved registers.
-  if (FirstSPAdjustAmount) {
-    uint64_t SecondSPAdjustAmount = RealStackSize - FirstSPAdjustAmount;
-    assert(SecondSPAdjustAmount > 0 &&
-           "SecondSPAdjustAmount should be greater than zero");
-    adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount,
-              MachineInstr::FrameSetup);
+  if (FirstSPAdjustAmount && SecondSPAdjustAmount) {
+    if (hasFP(MF)) {
+      assert(SecondSPAdjustAmount > 0 &&
+             "SecondSPAdjustAmount should be greater than zero");
+      adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount,
+                MachineInstr::FrameSetup);
+    } else {
+      // FIXME: RegScavenger will place the spill instruction before the
+      // prologue if a VReg is created in the prologue. This will pollute the
+      // caller's stack data. Therefore, until there is a better way, we just
+      // use the `addi.w/d` instruction for stack adjustment to ensure that
+      // no VReg is created.
+      for (int Val = SecondSPAdjustAmount; Val > 0; Val -= 2048)
+        BuildMI(MBB, MBBI, DL,
+                TII->get(IsLA64 ? LoongArch::ADDI_D : LoongArch::ADDI_W), SPReg)
+            .addReg(SPReg)
+            .addImm(Val < 2048 ? -Val : -2048)
+            .setMIFlag(MachineInstr::FrameSetup);
 
-    if (!hasFP(MF)) {
       // If we are using a frame-pointer, and thus emitted ".cfi_def_cfa fp, 0",
       // don't emit an sp-based .cfi_def_cfa_offset
       // Emit ".cfi_def_cfa_offset RealStackSize"
@@ -219,14 +254,12 @@
     Register VR =
         MF.getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
     BuildMI(MBB, MBBI, DL,
-            TII->get(STI.is64Bit() ? LoongArch::SRLI_D : LoongArch::SRLI_W),
-            VR)
+            TII->get(IsLA64 ? LoongArch::SRLI_D : LoongArch::SRLI_W), VR)
        .addReg(SPReg)
        .addImm(ShiftAmount)
        .setMIFlag(MachineInstr::FrameSetup);
     BuildMI(MBB, MBBI, DL,
-            TII->get(STI.is64Bit() ? LoongArch::SLLI_D : LoongArch::SLLI_W),
-            SPReg)
+            TII->get(IsLA64 ? LoongArch::SLLI_D : LoongArch::SLLI_W), SPReg)
        .addReg(VR)
        .addImm(ShiftAmount)
        .setMIFlag(MachineInstr::FrameSetup);
@@ -295,20 +328,27 @@
 //   st.d $ra, $sp, 2024
 //   st.d $fp, $sp, 2016
 //   addi.d $sp, $sp, -16
-uint64_t LoongArchFrameLowering::getFirstSPAdjustAmount(
-    const MachineFunction &MF) const {
+uint64_t
+LoongArchFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF,
+                                               bool IsPrologue) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
 
   // Return the FirstSPAdjustAmount if the StackSize can not fit in a signed
   // 12-bit and there exists a callee-saved register needing to be pushed.
-  if (!isInt<12>(MFI.getStackSize()) && (CSI.size() > 0)) {
+  if (!isInt<12>(MFI.getStackSize())) {
     // FirstSPAdjustAmount is chosen as (2048 - StackAlign) because 2048 will
     // cause sp = sp + 2048 in the epilogue to be split into multiple
     // instructions. Offsets smaller than 2048 can fit in a single load/store
     // instruction, and we have to stick with the stack alignment.
     // So (2048 - StackAlign) will satisfy the stack alignment.
-    return 2048 - getStackAlign().value();
+    //
+    // FIXME: This may seem odd. When multiple ADDI instructions are used to
+    // adjust the stack in the prologue and there are no callee-saved
+    // registers, we can take advantage of the split-SP-adjustment logic to
+    // reduce code changes.
+    return CSI.size() > 0 ? 2048 - getStackAlign().value()
+                          : (IsPrologue ? 2048 : 0);
   }
   return 0;
 }
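To make the new FirstSPAdjustAmount / ADDI-loop interaction concrete, here is a standalone sketch (mine, not part of the patch; the helper only mirrors the logic above) using the numbers from the @func test added further down:

#include <cassert>
#include <cstdint>

// Mirrors getFirstSPAdjustAmount() after this patch, for non-negative sizes;
// StackAlign is 16 on LoongArch and HasCSR stands in for CSI.size() > 0.
uint64_t firstSPAdjust(uint64_t StackSize, bool HasCSR, bool IsPrologue) {
  if (StackSize >= 2048) // i.e. !isInt<12>(StackSize)
    return HasCSR ? 2048 - 16 : (IsPrologue ? 2048 : 0);
  return 0;
}

int main() {
  // @func in emergency-spill-slot.ll: StackSize = 4112, no CSRs, no FP.
  uint64_t First = firstSPAdjust(4112, /*HasCSR=*/false, /*IsPrologue=*/true);
  uint64_t Second = 4112 - First;
  assert(First == 2048 && Second == 2064);
  // The prologue emits "addi.d $sp, $sp, -2048" for the first stage, then
  // the ADDI loop walks Second down: -2048, then -16. That is exactly the
  // three-addi sequence in the test's CHECK lines, and since First == 2048
  // and Second != 0, the early .cfi_def_cfa_offset is skipped in favor of
  // the final ".cfi_def_cfa_offset 4112".
  return 0;
}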
diff --git a/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll b/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 -O0 < %s | FileCheck %s
+
+@var = external global i32
+
+define void @func() {
+; CHECK-LABEL: func:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi.d $sp, $sp, -2048
+; CHECK-NEXT:    addi.d $sp, $sp, -2048
+; CHECK-NEXT:    addi.d $sp, $sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    pcalau12i $a0, %got_pc_hi20(var)
+; CHECK-NEXT:    ld.d $a1, $a0, %got_pc_lo12(var)
+; CHECK-NEXT:    ld.w $t8, $a1, 0
+; CHECK-NEXT:    ld.w $t7, $a1, 0
+; CHECK-NEXT:    ld.w $t6, $a1, 0
+; CHECK-NEXT:    ld.w $t5, $a1, 0
+; CHECK-NEXT:    ld.w $t4, $a1, 0
+; CHECK-NEXT:    ld.w $t3, $a1, 0
+; CHECK-NEXT:    ld.w $t2, $a1, 0
+; CHECK-NEXT:    ld.w $t1, $a1, 0
+; CHECK-NEXT:    ld.w $t0, $a1, 0
+; CHECK-NEXT:    ld.w $a7, $a1, 0
+; CHECK-NEXT:    ld.w $a6, $a1, 0
+; CHECK-NEXT:    ld.w $a5, $a1, 0
+; CHECK-NEXT:    ld.w $a4, $a1, 0
+; CHECK-NEXT:    ld.w $a3, $a1, 0
+; CHECK-NEXT:    ld.w $a2, $a1, 0
+; CHECK-NEXT:    ld.w $a0, $a1, 0
+; CHECK-NEXT:    st.d $fp, $sp, 0
+; CHECK-NEXT:    lu12i.w $fp, 1
+; CHECK-NEXT:    ori $fp, $fp, 12
+; CHECK-NEXT:    add.d $fp, $sp, $fp
+; CHECK-NEXT:    st.w $t8, $fp, 0
+; CHECK-NEXT:    ld.d $fp, $sp, 0
+; CHECK-NEXT:    st.w $t8, $a1, 0
+; CHECK-NEXT:    st.w $t7, $a1, 0
+; CHECK-NEXT:    st.w $t6, $a1, 0
+; CHECK-NEXT:    st.w $t5, $a1, 0
+; CHECK-NEXT:    st.w $t4, $a1, 0
+; CHECK-NEXT:    st.w $t3, $a1, 0
+; CHECK-NEXT:    st.w $t2, $a1, 0
+; CHECK-NEXT:    st.w $t1, $a1, 0
+; CHECK-NEXT:    st.w $t0, $a1, 0
+; CHECK-NEXT:    st.w $a7, $a1, 0
+; CHECK-NEXT:    st.w $a6, $a1, 0
+; CHECK-NEXT:    st.w $a5, $a1, 0
+; CHECK-NEXT:    st.w $a4, $a1, 0
+; CHECK-NEXT:    st.w $a3, $a1, 0
+; CHECK-NEXT:    st.w $a2, $a1, 0
+; CHECK-NEXT:    st.w $a0, $a1, 0
+; CHECK-NEXT:    lu12i.w $a0, 1
+; CHECK-NEXT:    ori $a0, $a0, 16
+; CHECK-NEXT:    add.d $sp, $sp, $a0
+; CHECK-NEXT:    ret
+  %space = alloca i32, align 4
+  %stackspace = alloca [1024 x i32], align 4
+
+  ;; Load values to increase register pressure.
+  %v0 = load volatile i32, ptr @var
+  %v1 = load volatile i32, ptr @var
+  %v2 = load volatile i32, ptr @var
+  %v3 = load volatile i32, ptr @var
+  %v4 = load volatile i32, ptr @var
+  %v5 = load volatile i32, ptr @var
+  %v6 = load volatile i32, ptr @var
+  %v7 = load volatile i32, ptr @var
+  %v8 = load volatile i32, ptr @var
+  %v9 = load volatile i32, ptr @var
+  %v10 = load volatile i32, ptr @var
+  %v11 = load volatile i32, ptr @var
+  %v12 = load volatile i32, ptr @var
+  %v13 = load volatile i32, ptr @var
+  %v14 = load volatile i32, ptr @var
+  %v15 = load volatile i32, ptr @var
+
+  ;; Computing a stack-relative address needs an additional register.
+  ;; We should get an emergency spill/reload for this.
+  store volatile i32 %v0, ptr %space
+
+  ;; Store the values so they are used.
+  store volatile i32 %v0, ptr @var
+  store volatile i32 %v1, ptr @var
+  store volatile i32 %v2, ptr @var
+  store volatile i32 %v3, ptr @var
+  store volatile i32 %v4, ptr @var
+  store volatile i32 %v5, ptr @var
+  store volatile i32 %v6, ptr @var
+  store volatile i32 %v7, ptr @var
+  store volatile i32 %v8, ptr @var
+  store volatile i32 %v9, ptr @var
+  store volatile i32 %v10, ptr @var
+  store volatile i32 %v11, ptr @var
+  store volatile i32 %v12, ptr @var
+  store volatile i32 %v13, ptr @var
+  store volatile i32 %v14, ptr @var
+  store volatile i32 %v15, ptr @var
+
+  ret void
+}
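The five-instruction cluster in the middle of the CHECK lines is the emergency spill/reload itself. A standalone sketch (mine; the layout offsets are inferred from the CHECK lines, not queried from MachineFrameInfo) of the arithmetic:

#include <cassert>
#include <cstdint>

constexpr bool isInt12(int64_t X) { return -2048 <= X && X < 2048; }

int main() {
  // %space lives at sp + 4108 in a 4112-byte frame; the scavenging slot
  // appears to sit at sp + 0.
  int64_t SpaceOffset = (INT64_C(1) << 12) | 12; // lu12i.w 1; ori ..., 12
  assert(SpaceOffset == 4108);
  // st.w cannot encode the offset directly, so the scavenger spills $fp to
  // the slot (st.d $fp, $sp, 0), builds the address in $fp, performs the
  // store, and reloads $fp (ld.d $fp, $sp, 0).
  assert(!isInt12(SpaceOffset));
  return 0;
}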
diff --git a/llvm/test/CodeGen/LoongArch/frame.ll b/llvm/test/CodeGen/LoongArch/frame.ll
--- a/llvm/test/CodeGen/LoongArch/frame.ll
+++ b/llvm/test/CodeGen/LoongArch/frame.ll
@@ -27,6 +27,7 @@
   ret i32 0
 }
 
+;; Note: will create an emergency spill slot if (!isInt<11>(StackSize)).
 ;; Should involve only one SP-adjusting addi per adjustment.
 define void @test_large_frame_size_2032() {
 ; CHECK-LABEL: test_large_frame_size_2032:
@@ -35,7 +36,7 @@
 ; CHECK-NEXT:    .cfi_def_cfa_offset 2032
 ; CHECK-NEXT:    addi.d $sp, $sp, 2032
 ; CHECK-NEXT:    ret
-  %1 = alloca i8, i32 2032
+  %1 = alloca i8, i32 2016 ; + 16(emergency slot) = 2032
   ret void
 }
 
@@ -49,7 +50,7 @@
 ; CHECK-NEXT:    addi.d $sp, $sp, 2032
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
 ; CHECK-NEXT:    ret
-  %1 = alloca i8, i32 2048
+  %1 = alloca i8, i32 2032 ; + 16(emergency slot) = 2048
   ret void
 }
 
@@ -63,21 +64,35 @@
 ; CHECK-NEXT:    addi.d $sp, $sp, 2032
 ; CHECK-NEXT:    addi.d $sp, $sp, 32
 ; CHECK-NEXT:    ret
-  %1 = alloca i8, i32 2064
+  %1 = alloca i8, i32 2048 ; + 16(emergency slot) = 2064
   ret void
 }
 
+;; NOTE: Due to the problem with the emergency spill slot, the scratch
+;; register will not be used when the fp is eliminated. To make this test
+;; valid, add the attribute "frame-pointer=all".
+
 ;; SP should be adjusted with help of a scratch register.
-define void @test_large_frame_size_1234576() {
+define void @test_large_frame_size_1234576() "frame-pointer"="all" {
 ; CHECK-LABEL: test_large_frame_size_1234576:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lu12i.w $a0, 301
-; CHECK-NEXT:    ori $a0, $a0, 1680
+; CHECK-NEXT:    addi.d $sp, $sp, -2032
+; CHECK-NEXT:    .cfi_def_cfa_offset 2032
+; CHECK-NEXT:    st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 2016 # 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset 1, -8
+; CHECK-NEXT:    .cfi_offset 22, -16
+; CHECK-NEXT:    addi.d $fp, $sp, 2032
+; CHECK-NEXT:    .cfi_def_cfa 22, 0
+; CHECK-NEXT:    lu12i.w $a0, 300
+; CHECK-NEXT:    ori $a0, $a0, 3760
 ; CHECK-NEXT:    sub.d $sp, $sp, $a0
-; CHECK-NEXT:    .cfi_def_cfa_offset 1234576
-; CHECK-NEXT:    lu12i.w $a0, 301
-; CHECK-NEXT:    ori $a0, $a0, 1680
+; CHECK-NEXT:    lu12i.w $a0, 300
+; CHECK-NEXT:    ori $a0, $a0, 3760
 ; CHECK-NEXT:    add.d $sp, $sp, $a0
+; CHECK-NEXT:    ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 2032
 ; CHECK-NEXT:    ret
   %1 = alloca i8, i32 1234567
   ret void
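The new CHECK lines for test_large_frame_size_1234576 follow from simple arithmetic; a standalone sketch (mine, with the frame total inferred from the CHECK lines):

#include <cassert>
#include <cstdint>

int main() {
  // 1234567 bytes of alloca plus the emergency spill slot, padded to the
  // 16-byte stack alignment, give 1234576; the $ra/$fp saves add 16 more.
  uint64_t Total = 1234592;
  // First stage: addi.d $sp, $sp, -2032. Second stage:
  uint64_t Second = Total - 2032;
  assert(Second == 1232560);
  assert(Second >> 12 == 300);      // lu12i.w $a0, 300
  assert((Second & 0xfff) == 3760); // ori $a0, $a0, 3760
  return 0;
}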
diff --git a/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll b/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll
--- a/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll
+++ b/llvm/test/CodeGen/LoongArch/split-sp-adjust.ll
@@ -24,19 +24,20 @@
 }
 
 ;; The stack size is 2032 and the SP adjustment will not be split.
+;; 2016 + 8(RA) + 8(emergency spill slot) = 2032
 define i32 @NoSplitSP() nounwind {
 ; CHECK-LABEL: NoSplitSP:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi.d $sp, $sp, -2032
 ; CHECK-NEXT:    st.d $ra, $sp, 2024 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    addi.d $a0, $sp, 8
 ; CHECK-NEXT:    bl %plt(foo)
 ; CHECK-NEXT:    move $a0, $zero
 ; CHECK-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
 ; CHECK-NEXT:    addi.d $sp, $sp, 2032
 ; CHECK-NEXT:    ret
 entry:
-  %xx = alloca [2024 x i8], align 1
+  %xx = alloca [2016 x i8], align 1
   %0 = getelementptr inbounds [2024 x i8], ptr %xx, i32 0, i32 0
   %call = call i32 @foo(ptr nonnull %0)
   ret i32 0
diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment.ll b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
--- a/llvm/test/CodeGen/LoongArch/stack-realignment.ll
+++ b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
@@ -305,42 +305,42 @@
 define void @caller512() {
 ; LA32-LABEL: caller512:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    addi.w $sp, $sp, -512
-; LA32-NEXT:    .cfi_def_cfa_offset 512
-; LA32-NEXT:    st.w $ra, $sp, 508 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $fp, $sp, 504 # 4-byte Folded Spill
+; LA32-NEXT:    addi.w $sp, $sp, -1024
+; LA32-NEXT:    .cfi_def_cfa_offset 1024
+; LA32-NEXT:    st.w $ra, $sp, 1020 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 1016 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    .cfi_offset 22, -8
-; LA32-NEXT:    addi.w $fp, $sp, 512
+; LA32-NEXT:    addi.w $fp, $sp, 1024
 ; LA32-NEXT:    .cfi_def_cfa 22, 0
 ; LA32-NEXT:    srli.w $a0, $sp, 9
 ; LA32-NEXT:    slli.w $sp, $a0, 9
-; LA32-NEXT:    addi.w $a0, $sp, 0
+; LA32-NEXT:    addi.w $a0, $sp, 512
 ; LA32-NEXT:    bl %plt(callee)
-; LA32-NEXT:    addi.w $sp, $fp, -512
-; LA32-NEXT:    ld.w $fp, $sp, 504 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $ra, $sp, 508 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 512
+; LA32-NEXT:    addi.w $sp, $fp, -1024
+; LA32-NEXT:    ld.w $fp, $sp, 1016 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 1020 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 1024
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: caller512:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -512
-; LA64-NEXT:    .cfi_def_cfa_offset 512
-; LA64-NEXT:    st.d $ra, $sp, 504 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 496 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -1024
+; LA64-NEXT:    .cfi_def_cfa_offset 1024
+; LA64-NEXT:    st.d $ra, $sp, 1016 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 1008 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    addi.d $fp, $sp, 512
+; LA64-NEXT:    addi.d $fp, $sp, 1024
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
 ; LA64-NEXT:    srli.d $a0, $sp, 9
 ; LA64-NEXT:    slli.d $sp, $a0, 9
-; LA64-NEXT:    addi.d $a0, $sp, 0
+; LA64-NEXT:    addi.d $a0, $sp, 512
 ; LA64-NEXT:    bl %plt(callee)
-; LA64-NEXT:    addi.d $sp, $fp, -512
-; LA64-NEXT:    ld.d $fp, $sp, 496 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 504 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 512
+; LA64-NEXT:    addi.d $sp, $fp, -1024
+; LA64-NEXT:    ld.d $fp, $sp, 1008 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 1016 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 1024
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 512
   call void @callee(i8* %1)
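Why caller512's frame doubles from 512 to 1024: the emergency slot occupies the bottom of the realigned region, so the 512-aligned object can no longer sit at the realigned sp itself. A standalone sketch (mine; the placement is inferred from the new "addi.d $a0, $sp, 512" line, LA64 flavor):

#include <cassert>
#include <cstdint>

constexpr uint64_t alignTo(uint64_t X, uint64_t A) {
  return (X + A - 1) / A * A;
}

int main() {
  uint64_t SlotEnd = 16; // emergency slot (plus padding) at realigned sp + 0
  uint64_t ObjOffset = alignTo(SlotEnd, 512); // next 512-aligned offset
  assert(ObjOffset == 512);        // matches: addi.d $a0, $sp, 512
  assert(ObjOffset + 512 == 1024); // matches the new 1024-byte frame
  return 0;
}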
-; LA32-NEXT:    addi.w $sp, $sp, -1024
-; LA32-NEXT:    .cfi_def_cfa_offset 1024
-; LA32-NEXT:    st.w $ra, $sp, 1020 # 4-byte Folded Spill
-; LA32-NEXT:    st.w $fp, $sp, 1016 # 4-byte Folded Spill
+; LA32-NEXT:    addi.w $sp, $sp, -2032
+; LA32-NEXT:    .cfi_def_cfa_offset 2032
+; LA32-NEXT:    st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 2024 # 4-byte Folded Spill
 ; LA32-NEXT:    .cfi_offset 1, -4
 ; LA32-NEXT:    .cfi_offset 22, -8
-; LA32-NEXT:    addi.w $fp, $sp, 1024
+; LA32-NEXT:    addi.w $fp, $sp, 2032
 ; LA32-NEXT:    .cfi_def_cfa 22, 0
+; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    srli.w $a0, $sp, 10
 ; LA32-NEXT:    slli.w $sp, $a0, 10
-; LA32-NEXT:    addi.w $a0, $sp, 0
+; LA32-NEXT:    addi.w $a0, $sp, 1024
 ; LA32-NEXT:    bl %plt(callee)
-; LA32-NEXT:    addi.w $sp, $fp, -1024
-; LA32-NEXT:    ld.w $fp, $sp, 1016 # 4-byte Folded Reload
-; LA32-NEXT:    ld.w $ra, $sp, 1020 # 4-byte Folded Reload
-; LA32-NEXT:    addi.w $sp, $sp, 1024
+; LA32-NEXT:    addi.w $sp, $fp, -2048
+; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    ld.w $fp, $sp, 2024 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 2032
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: caller1024:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -1024
-; LA64-NEXT:    .cfi_def_cfa_offset 1024
-; LA64-NEXT:    st.d $ra, $sp, 1016 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 1008 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -2032
+; LA64-NEXT:    .cfi_def_cfa_offset 2032
+; LA64-NEXT:    st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 2016 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    addi.d $fp, $sp, 1024
+; LA64-NEXT:    addi.d $fp, $sp, 2032
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
+; LA64-NEXT:    addi.d $sp, $sp, -16
 ; LA64-NEXT:    srli.d $a0, $sp, 10
 ; LA64-NEXT:    slli.d $sp, $a0, 10
-; LA64-NEXT:    addi.d $a0, $sp, 0
+; LA64-NEXT:    addi.d $a0, $sp, 1024
 ; LA64-NEXT:    bl %plt(callee)
-; LA64-NEXT:    addi.d $sp, $fp, -1024
-; LA64-NEXT:    ld.d $fp, $sp, 1008 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 1016 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 1024
+; LA64-NEXT:    addi.d $sp, $fp, -2048
+; LA64-NEXT:    addi.d $sp, $sp, 16
+; LA64-NEXT:    ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 2032
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 1024
   call void @callee(i8* %1)
@@ -461,13 +465,17 @@
 ; LA32-NEXT:    .cfi_offset 22, -8
 ; LA32-NEXT:    addi.w $fp, $sp, 2032
 ; LA32-NEXT:    .cfi_def_cfa 22, 0
+; LA32-NEXT:    addi.w $sp, $sp, -2048
 ; LA32-NEXT:    addi.w $sp, $sp, -16
 ; LA32-NEXT:    srli.w $a0, $sp, 11
 ; LA32-NEXT:    slli.w $sp, $a0, 11
-; LA32-NEXT:    addi.w $a0, $sp, 0
+; LA32-NEXT:    ori $a0, $zero, 2048
+; LA32-NEXT:    add.w $a0, $sp, $a0
 ; LA32-NEXT:    bl %plt(callee)
-; LA32-NEXT:    addi.w $sp, $fp, -2048
-; LA32-NEXT:    addi.w $sp, $sp, 16
+; LA32-NEXT:    lu12i.w $a0, 1
+; LA32-NEXT:    sub.w $sp, $fp, $a0
+; LA32-NEXT:    addi.w $sp, $sp, 2032
+; LA32-NEXT:    addi.w $sp, $sp, 32
 ; LA32-NEXT:    ld.w $fp, $sp, 2024 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 2028 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 2032
@@ -483,13 +491,17 @@
 ; LA64-NEXT:    .cfi_offset 22, -16
 ; LA64-NEXT:    addi.d $fp, $sp, 2032
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
+; LA64-NEXT:    addi.d $sp, $sp, -2048
 ; LA64-NEXT:    addi.d $sp, $sp, -16
 ; LA64-NEXT:    srli.d $a0, $sp, 11
 ; LA64-NEXT:    slli.d $sp, $a0, 11
-; LA64-NEXT:    addi.d $a0, $sp, 0
+; LA64-NEXT:    ori $a0, $zero, 2048
+; LA64-NEXT:    add.d $a0, $sp, $a0
 ; LA64-NEXT:    bl %plt(callee)
-; LA64-NEXT:    addi.d $sp, $fp, -2048
-; LA64-NEXT:    addi.d $sp, $sp, 16
+; LA64-NEXT:    lu12i.w $a0, 1
+; LA64-NEXT:    sub.d $sp, $fp, $a0
+; LA64-NEXT:    addi.d $sp, $sp, 2032
+; LA64-NEXT:    addi.d $sp, $sp, 32
 ; LA64-NEXT:    ld.d $fp, $sp, 2016 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
 ; LA64-NEXT:    addi.d $sp, $sp, 2032
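caller1024 and caller2048 show the hasFP side of the new split: the first stage is capped at 2032 (2048 minus the 16-byte stack alignment) so the CSR saves keep small offsets, and the remainder is emitted before the realignment shifts. A standalone sketch of the totals (mine, inferred from the CHECK lines):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t First = 2048 - 16; // getFirstSPAdjustAmount with CSRs present
  assert(First == 2032);
  // caller1024: 2048-byte frame -> second stage is the single "addi ... -16".
  assert(2048 - First == 16);
  // caller2048: 4096-byte frame -> second stage is 2064, which appears as
  // the "addi ... -2048" / "addi ... -16" pair before srli/slli realign sp.
  assert(4096 - First == 2064);
  return 0;
}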
@@ -539,16 +551,19 @@
 ; LA32-NEXT:    .cfi_offset 22, -8
 ; LA32-NEXT:    addi.w $fp, $sp, 2032
 ; LA32-NEXT:    .cfi_def_cfa 22, 0
-; LA32-NEXT:    addi.w $sp, $sp, -2048
-; LA32-NEXT:    addi.w $sp, $sp, -16
+; LA32-NEXT:    lu12i.w $a0, 1
+; LA32-NEXT:    ori $a0, $a0, 2064
+; LA32-NEXT:    sub.w $sp, $sp, $a0
 ; LA32-NEXT:    srli.w $a0, $sp, 12
 ; LA32-NEXT:    slli.w $sp, $a0, 12
-; LA32-NEXT:    addi.w $a0, $sp, 0
-; LA32-NEXT:    bl %plt(callee)
 ; LA32-NEXT:    lu12i.w $a0, 1
+; LA32-NEXT:    add.w $a0, $sp, $a0
+; LA32-NEXT:    bl %plt(callee)
+; LA32-NEXT:    lu12i.w $a0, 2
 ; LA32-NEXT:    sub.w $sp, $fp, $a0
-; LA32-NEXT:    addi.w $sp, $sp, 2032
-; LA32-NEXT:    addi.w $sp, $sp, 32
+; LA32-NEXT:    lu12i.w $a0, 1
+; LA32-NEXT:    ori $a0, $a0, 2064
+; LA32-NEXT:    add.w $sp, $sp, $a0
 ; LA32-NEXT:    ld.w $fp, $sp, 2024 # 4-byte Folded Reload
 ; LA32-NEXT:    ld.w $ra, $sp, 2028 # 4-byte Folded Reload
 ; LA32-NEXT:    addi.w $sp, $sp, 2032
@@ -564,16 +579,19 @@
 ; LA64-NEXT:    .cfi_offset 22, -16
 ; LA64-NEXT:    addi.d $fp, $sp, 2032
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
-; LA64-NEXT:    addi.d $sp, $sp, -2048
-; LA64-NEXT:    addi.d $sp, $sp, -16
+; LA64-NEXT:    lu12i.w $a0, 1
+; LA64-NEXT:    ori $a0, $a0, 2064
+; LA64-NEXT:    sub.d $sp, $sp, $a0
 ; LA64-NEXT:    srli.d $a0, $sp, 12
 ; LA64-NEXT:    slli.d $sp, $a0, 12
-; LA64-NEXT:    addi.d $a0, $sp, 0
-; LA64-NEXT:    bl %plt(callee)
 ; LA64-NEXT:    lu12i.w $a0, 1
+; LA64-NEXT:    add.d $a0, $sp, $a0
+; LA64-NEXT:    bl %plt(callee)
+; LA64-NEXT:    lu12i.w $a0, 2
 ; LA64-NEXT:    sub.d $sp, $fp, $a0
-; LA64-NEXT:    addi.d $sp, $sp, 2032
-; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    lu12i.w $a0, 1
+; LA64-NEXT:    ori $a0, $a0, 2064
+; LA64-NEXT:    add.d $sp, $sp, $a0
 ; LA64-NEXT:    ld.d $fp, $sp, 2016 # 8-byte Folded Reload
 ; LA64-NEXT:    ld.d $ra, $sp, 2024 # 8-byte Folded Reload
 ; LA64-NEXT:    addi.d $sp, $sp, 2032
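For caller4096 the second stage no longer fits a short addi chain, so it is materialized with lu12i.w/ori. A standalone sketch of the immediates (mine, taken from the CHECK lines):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Second = (UINT64_C(1) << 12) | 2064; // lu12i.w $a0, 1; ori ..., 2064
  assert(Second == 6160);
  assert(2032 + Second == 8192); // total frame; hence "lu12i.w $a0, 2" (8192)
                                 // in the epilogue to step back from $fp
  uint64_t ObjOffset = UINT64_C(1) << 12; // callee arg: realigned sp + 4096
  assert(ObjOffset == 4096);
  return 0;
}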