diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -53,21 +53,55 @@ MachineInstr::MIFlag Flag) const { const LoongArchInstrInfo *TII = STI.getInstrInfo(); bool IsLA64 = STI.is64Bit(); + unsigned Addi = IsLA64 ? LoongArch::ADDI_D : LoongArch::ADDI_W; if (DestReg == SrcReg && Val == 0) return; if (isInt<12>(Val)) { // addi.w/d $DstReg, $SrcReg, Val - BuildMI(MBB, MBBI, DL, - TII->get(IsLA64 ? LoongArch::ADDI_D : LoongArch::ADDI_W), DestReg) + BuildMI(MBB, MBBI, DL, TII->get(Addi), DestReg) .addReg(SrcReg) .addImm(Val) .setMIFlag(Flag); return; } - report_fatal_error("adjustReg cannot yet handle adjustments >12 bits"); + // Try to split the offset across two ADDIs. We need to keep the stack pointer + // aligned after each ADDI. We need to determine the maximum value we can put + // in each ADDI. In the negative direction, we can use -2048 which is always + // sufficiently aligned. In the positive direction, we need to find the + // largest 12-bit immediate that is aligned. Exclude -4096 since it can be + // created with LU12I.W. + assert(getStackAlign().value() < 2048 && "Stack alignment too large"); + int64_t MaxPosAdjStep = 2048 - getStackAlign().value(); + if (Val > -4096 && Val <= (2 * MaxPosAdjStep)) { + int64_t FirstAdj = Val < 0 ? -2048 : MaxPosAdjStep; + Val -= FirstAdj; + BuildMI(MBB, MBBI, DL, TII->get(Addi), DestReg) + .addReg(SrcReg) + .addImm(FirstAdj) + .setMIFlag(Flag); + BuildMI(MBB, MBBI, DL, TII->get(Addi), DestReg) + .addReg(DestReg, RegState::Kill) + .addImm(Val) + .setMIFlag(Flag); + return; + } + + unsigned Opc = IsLA64 ? LoongArch::ADD_D : LoongArch::ADD_W; + if (Val < 0) { + Val = -Val; + Opc = IsLA64 ? 
LoongArch::SUB_D : LoongArch::SUB_W; + } + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + Register ScratchReg = MRI.createVirtualRegister(&LoongArch::GPRRegClass); + TII->movImm(MBB, MBBI, DL, ScratchReg, Val, Flag); + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addReg(ScratchReg, RegState::Kill) + .setMIFlag(Flag); } // Determine the size of the frame and maximum call frame size. diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -41,6 +41,11 @@ int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; + // Materializes the given integer Val into DstReg. + void movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DstReg, uint64_t Val, + MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const; + unsigned getInstSizeInBytes(const MachineInstr &MI) const override; MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override; @@ -60,6 +65,9 @@ bool reverseBranchCondition(SmallVectorImpl &Cond) const override; + +protected: + const LoongArchSubtarget &STI; }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -13,6 +13,7 @@ #include "LoongArchInstrInfo.h" #include "LoongArch.h" #include "LoongArchMachineFunctionInfo.h" +#include "MCTargetDesc/LoongArchMatInt.h" using namespace llvm; @@ -21,7 +22,8 @@ LoongArchInstrInfo::LoongArchInstrInfo(LoongArchSubtarget &STI) : LoongArchGenInstrInfo(LoongArch::ADJCALLSTACKDOWN, - LoongArch::ADJCALLSTACKUP) {} + LoongArch::ADJCALLSTACKUP), + STI(STI) {} void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator 
MBBI, @@ -114,6 +116,43 @@ .addMemOperand(MMO); } +void LoongArchInstrInfo::movImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register DstReg, + uint64_t Val, MachineInstr::MIFlag Flag) const { + Register SrcReg = LoongArch::R0; + + if (!STI.is64Bit() && !isInt<32>(Val)) + report_fatal_error("Should only materialize 32-bit constants for LA32"); + + auto Seq = LoongArchMatInt::generateInstSeq(Val); + assert(!Seq.empty()); + + for (auto &Inst : Seq) { + switch (Inst.Opc) { + case LoongArch::LU12I_W: + BuildMI(MBB, MBBI, DL, get(Inst.Opc), DstReg) + .addImm(Inst.Imm) + .setMIFlag(Flag); + break; + case LoongArch::ADDI_W: + case LoongArch::ORI: + case LoongArch::LU32I_D: // "rj" is needed due to InstrInfo pattern + case LoongArch::LU52I_D: + BuildMI(MBB, MBBI, DL, get(Inst.Opc), DstReg) + .addReg(SrcReg, RegState::Kill) + .addImm(Inst.Imm) + .setMIFlag(Flag); + break; + default: + assert(false && "Unknown insn emitted by LoongArchMatInt"); + } + + // Only the first instruction has $zero as its source. 
+ SrcReg = DstReg; + } +} + unsigned LoongArchInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return MI.getDesc().getSize(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h @@ -43,6 +43,14 @@ RegScavenger *RS = nullptr) const override; Register getFrameRegister(const MachineFunction &MF) const override; + + bool requiresRegisterScavenging(const MachineFunction &MF) const override { + return true; + } + + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override { + return true; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp @@ -13,7 +13,9 @@ #include "LoongArchRegisterInfo.h" #include "LoongArch.h" +#include "LoongArchInstrInfo.h" #include "LoongArchSubtarget.h" +#include "MCTargetDesc/LoongArchMCTargetDesc.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -112,7 +114,11 @@ assert(SPAdj == 0 && "Unexpected non-zero SPAdj value"); MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MI.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const LoongArchSubtarget &STI = MF.getSubtarget(); + const LoongArchInstrInfo *TII = STI.getInstrInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); DebugLoc DL = MI.getDebugLoc(); @@ -122,12 +128,32 @@ TFI->getFrameIndexReference(MF, FrameIndex, FrameReg) + StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm()); - // Offsets must be encodable with a 12-bit immediate field. 
+ bool FrameRegIsKill = false; + if (!isInt<12>(Offset.getFixed())) { - report_fatal_error("Frame offsets outside of the signed 12-bit range is " - "not supported currently"); + unsigned Addi = STI.is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W; + unsigned Add = STI.is64Bit() ? LoongArch::ADD_D : LoongArch::ADD_W; + + // The offset won't fit in an immediate, so use a scratch register instead. + // Modify Offset and FrameReg appropriately. + Register ScratchReg = MRI.createVirtualRegister(&LoongArch::GPRRegClass); + TII->movImm(MBB, II, DL, ScratchReg, Offset.getFixed()); + if (MI.getOpcode() == Addi) { + BuildMI(MBB, II, DL, TII->get(Add), MI.getOperand(0).getReg()) + .addReg(FrameReg) + .addReg(ScratchReg, RegState::Kill); + MI.eraseFromParent(); + return; + } + BuildMI(MBB, II, DL, TII->get(Add), ScratchReg) + .addReg(FrameReg) + .addReg(ScratchReg, RegState::Kill); + Offset = StackOffset::getFixed(0); + FrameReg = ScratchReg; + FrameRegIsKill = true; } - MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false); + MI.getOperand(FIOperandNum) + .ChangeToRegister(FrameReg, false, false, FrameRegIsKill); MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed()); } diff --git a/llvm/test/CodeGen/LoongArch/frame.ll b/llvm/test/CodeGen/LoongArch/frame.ll --- a/llvm/test/CodeGen/LoongArch/frame.ll +++ b/llvm/test/CodeGen/LoongArch/frame.ll @@ -2,6 +2,17 @@ ; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s %struct.key_t = type { i32, [16 x i8] } +%size2016 = type [2016 x i8] +%size2032 = type [2032 x i8] +%size2048 = type [2048 x i8] +%size1234567 = type [1234567 x i8] + +declare void @llvm.memset.p0i8.i64(ptr, i8, i64, i1) +declare void @test1(ptr) +declare void @test2016(ptr byval(%size2016)) +declare void @test2032(ptr byval(%size2032)) +declare void @test2048(ptr byval(%size2048)) +declare void @test1234567(ptr byval(%size1234567)) define i32 @test() nounwind { ; CHECK-LABEL: test: @@ -24,6 +35,134 @@ ret i32 0 } -declare void 
@llvm.memset.p0i8.i64(ptr, i8, i64, i1) +;; Should involve only one SP-adjusting addi per adjustment. +;; TODO: The codegen quality of this and the other cases below should be +;; improved later; FP should be unnecessary in this case (cross-check RISCV +;; codegen to see how FP is avoided there). +define void @test_large_frame_size_2032(ptr byval(%size2016) %x) { +; CHECK-LABEL: test_large_frame_size_2032: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -2032 +; CHECK-NEXT: .cfi_def_cfa_offset 2032 +; CHECK-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset 1, -8 +; CHECK-NEXT: .cfi_offset 22, -16 +; CHECK-NEXT: move $a1, $a0 +; CHECK-NEXT: addi.d $fp, $sp, 0 +; CHECK-NEXT: ori $a2, $zero, 2016 +; CHECK-NEXT: move $a0, $fp +; CHECK-NEXT: bl %plt(memcpy) +; CHECK-NEXT: move $a0, $fp +; CHECK-NEXT: bl %plt(test2032) +; CHECK-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 2032 +; CHECK-NEXT: ret + call void @test2032(ptr byval(%size2016) %x) + ret void +} -declare void @test1(ptr) +;; Should involve two SP-adjusting addi's when adjusting SP up, but only one +;; when adjusting down.
+define void @test_large_frame_size_2048(ptr byval(%size2032) %x) { +; CHECK-LABEL: test_large_frame_size_2048: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -2048 +; CHECK-NEXT: .cfi_def_cfa_offset 2048 +; CHECK-NEXT: st.d $ra, $sp, 2040 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 2032 # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset 1, -8 +; CHECK-NEXT: .cfi_offset 22, -16 +; CHECK-NEXT: move $a1, $a0 +; CHECK-NEXT: addi.d $fp, $sp, 0 +; CHECK-NEXT: ori $a2, $zero, 2032 +; CHECK-NEXT: move $a0, $fp +; CHECK-NEXT: bl %plt(memcpy) +; CHECK-NEXT: move $a0, $fp +; CHECK-NEXT: bl %plt(test2032) +; CHECK-NEXT: ld.d $fp, $sp, 2032 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 2040 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 2032 +; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: ret + call void @test2032(ptr byval(%size2032) %x) + ret void +} + +;; Should involve two SP-adjusting addi's per adjustment. +define void @test_large_frame_size_2064(ptr byval(%size2048) %x) { +; CHECK-LABEL: test_large_frame_size_2064: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -2048 +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 2064 +; CHECK-NEXT: ori $a1, $zero, 2056 +; CHECK-NEXT: add.d $a1, $sp, $a1 +; CHECK-NEXT: st.d $ra, $a1, 0 # 8-byte Folded Spill +; CHECK-NEXT: ori $a1, $zero, 2048 +; CHECK-NEXT: add.d $a1, $sp, $a1 +; CHECK-NEXT: st.d $fp, $a1, 0 # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset 1, -8 +; CHECK-NEXT: .cfi_offset 22, -16 +; CHECK-NEXT: move $a1, $a0 +; CHECK-NEXT: addi.d $fp, $sp, 0 +; CHECK-NEXT: ori $a2, $zero, 2048 +; CHECK-NEXT: move $a0, $fp +; CHECK-NEXT: bl %plt(memcpy) +; CHECK-NEXT: move $a0, $fp +; CHECK-NEXT: bl %plt(test2048) +; CHECK-NEXT: ori $a0, $zero, 2048 +; CHECK-NEXT: add.d $a0, $sp, $a0 +; CHECK-NEXT: ld.d $fp, $a0, 0 # 8-byte Folded Reload +; CHECK-NEXT: ori $a0, $zero, 2056 +; CHECK-NEXT: add.d $a0, $sp, $a0 +; CHECK-NEXT: ld.d $ra, $a0, 0 # 8-byte Folded Reload +; CHECK-NEXT: addi.d 
$sp, $sp, 2032 +; CHECK-NEXT: addi.d $sp, $sp, 32 +; CHECK-NEXT: ret + call void @test2048(ptr byval(%size2048) %x) + ret void +} + +;; SP should be adjusted with help of a scratch register. +define void @test_large_frame_size_1234592(ptr byval(%size1234567) %x) { +; CHECK-LABEL: test_large_frame_size_1234592: +; CHECK: # %bb.0: +; CHECK-NEXT: lu12i.w $a1, 301 +; CHECK-NEXT: ori $a1, $a1, 1696 +; CHECK-NEXT: sub.d $sp, $sp, $a1 +; CHECK-NEXT: .cfi_def_cfa_offset 1234592 +; CHECK-NEXT: lu12i.w $a1, 301 +; CHECK-NEXT: ori $a1, $a1, 1688 +; CHECK-NEXT: add.d $a1, $sp, $a1 +; CHECK-NEXT: st.d $ra, $a1, 0 # 8-byte Folded Spill +; CHECK-NEXT: lu12i.w $a1, 301 +; CHECK-NEXT: ori $a1, $a1, 1680 +; CHECK-NEXT: add.d $a1, $sp, $a1 +; CHECK-NEXT: st.d $fp, $a1, 0 # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset 1, -8 +; CHECK-NEXT: .cfi_offset 22, -16 +; CHECK-NEXT: move $a1, $a0 +; CHECK-NEXT: lu12i.w $a0, 301 +; CHECK-NEXT: ori $a2, $a0, 1671 +; CHECK-NEXT: addi.d $fp, $sp, 9 +; CHECK-NEXT: move $a0, $fp +; CHECK-NEXT: bl %plt(memcpy) +; CHECK-NEXT: move $a0, $fp +; CHECK-NEXT: bl %plt(test1234567) +; CHECK-NEXT: lu12i.w $a0, 301 +; CHECK-NEXT: ori $a0, $a0, 1680 +; CHECK-NEXT: add.d $a0, $sp, $a0 +; CHECK-NEXT: ld.d $fp, $a0, 0 # 8-byte Folded Reload +; CHECK-NEXT: lu12i.w $a0, 301 +; CHECK-NEXT: ori $a0, $a0, 1688 +; CHECK-NEXT: add.d $a0, $sp, $a0 +; CHECK-NEXT: ld.d $ra, $a0, 0 # 8-byte Folded Reload +; CHECK-NEXT: lu12i.w $a0, 301 +; CHECK-NEXT: ori $a0, $a0, 1696 +; CHECK-NEXT: add.d $sp, $sp, $a0 +; CHECK-NEXT: ret + call void @test1234567(ptr byval(%size1234567) %x) + ret void +}