Index: lib/Target/AArch64/AArch64FrameLowering.h
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.h
+++ lib/Target/AArch64/AArch64FrameLowering.h
@@ -20,6 +20,11 @@
 class AArch64FrameLowering : public TargetFrameLowering {
 public:
+  enum StackID {
+    ST_Default = 0,
+    ST_SVEVector = 1
+  };
+
   explicit AArch64FrameLowering()
       : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
                             true /*StackRealignable*/) {}
 
Index: lib/Target/AArch64/AArch64FrameLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.cpp
+++ lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -184,6 +184,11 @@
   return DefaultSafeSPDisplacement;
 }
 
+static uint64_t getSVEStackSize(const MachineFunction &MF) {
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  return AFI->getStackSizeSVE();
+}
+
 bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
   if (!EnableRedZone)
     return false;
@@ -196,7 +201,8 @@
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   unsigned NumBytes = AFI->getLocalStackSize();
 
-  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128);
+  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
+           getSVEStackSize(MF));
 }
 
 /// hasFP - Return true if the specified function should have a dedicated frame
@@ -839,6 +845,13 @@
   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
     return;
 
+  // Allocate the SVE area.
+  int64_t SVEStackSize = getSVEStackSize(MF);
+  if (SVEStackSize)
+    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                    {-SVEStackSize, MVT::nxv1i8}, TII,
+                    MachineInstr::FrameSetup);
+
   // getStackSize() includes all the locals in its size calculation. We don't
   // include these locals when computing the stack size of a funclet, as they
   // are allocated in the parent's stack frame and accessed via the frame
@@ -849,6 +862,7 @@
                        : (int)MFI.getStackSize();
   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
     assert(!HasFP && "unexpected function without stack frame but with FP");
+    assert(!SVEStackSize && "Must have stack frame with SVE");
     // All of the stack allocation is for locals.
     AFI->setLocalStackSize(NumBytes);
     if (!NumBytes)
@@ -1264,6 +1278,8 @@
                            : MFI.getStackSize();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  MachineBasicBlock::iterator FirstTerminator = MBB.getFirstTerminator();
+
   // All calls are tail calls in GHC calling conv, and functions have no
   // prologue/epilogue.
   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
     return;
@@ -1332,10 +1348,12 @@
   if (MF.hasEHFunclets())
     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+  int64_t SVEStackSize = getSVEStackSize(MF);
+
   // Assume we can't combine the last pop with the sp restore.
 
   if (!CombineSPBump && PrologueSaveSize != 0) {
-    MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
+    MachineBasicBlock::iterator Pop = std::prev(FirstTerminator);
     while (AArch64InstrInfo::isSEHInstruction(*Pop))
       Pop = std::prev(Pop);
     // Converting the last ldp to a post-index ldp is valid only if the last
@@ -1357,7 +1375,7 @@
   // Move past the restores of the callee-saved registers.
   // If we plan on combining the sp bump of the local stack size and the callee
   // save stack size, we might need to adjust the CSR save and restore offsets.
-  MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
+  MachineBasicBlock::iterator LastPopI = FirstTerminator;
   MachineBasicBlock::iterator Begin = MBB.begin();
   while (LastPopI != Begin) {
     --LastPopI;
@@ -1375,13 +1393,17 @@
 
   // If there is a single SP update, insert it before the ret and we're done.
   if (CombineSPBump) {
-    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+    emitFrameOffset(MBB, FirstTerminator, DL, AArch64::SP, AArch64::SP,
                     {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
     if (NeedsWinCFI)
-      BuildMI(MBB, MBB.getFirstTerminator(), DL,
+      BuildMI(MBB, FirstTerminator, DL,
              TII->get(AArch64::SEH_EpilogEnd))
           .setMIFlag(MachineInstr::FrameDestroy);
+
+    emitFrameOffset(MBB, FirstTerminator, DL, AArch64::SP, AArch64::SP,
+                    {SVEStackSize, MVT::nxv1i8}, TII,
+                    MachineInstr::FrameDestroy, false, false);
     return;
   }
 
@@ -1406,14 +1428,14 @@
     // If we're done after this, make sure to help the load store optimizer.
     if (Done)
-      adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
+      adaptForLdStOpt(MBB, FirstTerminator, LastPopI);
 
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                     {StackRestoreBytes, MVT::i8}, TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
-    if (Done) {
+    if (!SVEStackSize && Done) {
       if (NeedsWinCFI)
-        BuildMI(MBB, MBB.getFirstTerminator(), DL,
+        BuildMI(MBB, FirstTerminator, DL,
                 TII->get(AArch64::SEH_EpilogEnd))
             .setMIFlag(MachineInstr::FrameDestroy);
       return;
     }
@@ -1442,7 +1464,7 @@
   // Find an insertion point for the first ldp so that it goes before the
   // shadow call stack epilog instruction. This ensures that the restore of
   // lr from x18 is placed after the restore from sp.
-  auto FirstSPPopI = MBB.getFirstTerminator();
+  auto FirstSPPopI = FirstTerminator;
   while (FirstSPPopI != Begin) {
     auto Prev = std::prev(FirstSPPopI);
     if (Prev->getOpcode() != AArch64::LDRXpre ||
@@ -1458,8 +1480,12 @@
                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
   }
   if (NeedsWinCFI)
-    BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
+    BuildMI(MBB, FirstTerminator, DL, TII->get(AArch64::SEH_EpilogEnd))
         .setMIFlag(MachineInstr::FrameDestroy);
+
+  emitFrameOffset(MBB, FirstTerminator, DL, AArch64::SP, AArch64::SP,
+                  {SVEStackSize, MVT::nxv1i8}, TII,
+                  MachineInstr::FrameDestroy, false, false);
 }
 
 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -1517,6 +1543,9 @@
   bool isCSR = !isFixed &&
                MFI.getObjectOffset(FI) >= -((int)AFI->getCalleeSavedStackSize());
 
+  uint64_t SVEStackSize = getSVEStackSize(MF);
+  assert(!SVEStackSize && "Accessing SVE frame indices not yet supported");
+
   // Use frame pointer to reference fixed objects. Use it for locals if
   // there are VLAs or a dynamically realigned SP (and thus the SP isn't
   // reliable as a base). Make sure useFPForScavengingIndex() does the
@@ -2075,8 +2104,19 @@
                  << ' ' << printReg(Reg, RegInfo);
              dbgs() << "\n";);
 
+  bool HasSVEStackObjects = [&MFI]() {
+    for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
+      if (MFI.getStackID(I) == ST_SVEVector &&
+          MFI.getObjectOffset(I) < 0)
+        return true;
+    // Note: We don't take allocatable stack objects into
+    // account yet, because allocation for those is not yet
+    // implemented.
+    return false;
+  }();
+
   // If any callee-saved registers are used, the frame cannot be eliminated.
-  bool CanEliminateFrame = SavedRegs.count() == 0;
+  bool CanEliminateFrame = (SavedRegs.count() == 0) && !HasSVEStackObjects;
 
   // The CSR spill slots have not been allocated yet, so estimateStackSize
   // won't include them.
@@ -2139,12 +2179,34 @@
 
 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
+         "Upwards growing stack unsupported");
+
+
+  // Process all fixed stack SVE objects.
+  int64_t Offset = 0;
+  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) {
+    unsigned StackID = MFI.getStackID(I);
+    if (StackID == ST_SVEVector) {
+      int64_t FixedOffset = -MFI.getObjectOffset(I);
+      if (FixedOffset > Offset) Offset = FixedOffset;
+    }
+  }
+
+  unsigned MaxAlign = getStackAlignment();
+  uint64_t SVEStackSize = alignTo(Offset, MaxAlign);
+
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  AFI->setStackSizeSVE(SVEStackSize);
+  AFI->setMaxAlignSVE(MaxAlign);
+
   // If this function isn't doing Win64-style C++ EH, we don't need to do
   // anything.
   if (!MF.hasEHFunclets())
     return;
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  MachineFrameInfo &MFI = MF.getFrameInfo();
   WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
 
   MachineBasicBlock &MBB = MF.front();
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2977,6 +2977,16 @@
     MaxEncoding = 0xfff;
     ShiftSize = 12;
     break;
+  case AArch64::ADDVL_XXI:
+  case AArch64::ADDPL_XXI:
+    MaxEncoding = 31;
+    ShiftSize = 0;
+    if (Offset < 0) {
+      MaxEncoding = 32;
+      Sign = -1;
+      Offset = -Offset;
+    }
+    break;
   default:
     llvm_unreachable("Unsupported opcode");
   }
@@ -3038,8 +3048,8 @@
                            StackOffset Offset, const TargetInstrInfo *TII,
                            MachineInstr::MIFlag Flag, bool SetNZCV,
                            bool NeedsWinCFI) {
-  int64_t Bytes;
-  Offset.getForFrameOffset(Bytes);
+  int64_t Bytes, PLSized, VLSized;
+  Offset.getForFrameOffset(Bytes, PLSized, VLSized);
 
   // First emit non-scalable frame offsets, or a simple 'mov'.
   if (Bytes || (Offset.isZero() && SrcReg != DestReg)) {
@@ -3054,6 +3064,23 @@
                        Flag, NeedsWinCFI);
     SrcReg = DestReg;
   }
+
+  assert(!(SetNZCV && (PLSized || VLSized)) &&
+         "SetNZCV not supported with SVE vectors");
+  assert(!(NeedsWinCFI && (PLSized || VLSized)) &&
+         "WinCFI not supported with SVE vectors");
+
+  if (VLSized) {
+    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, VLSized,
+                       AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI);
+    SrcReg = DestReg;
+  }
+
+  if (PLSized) {
+    assert(DestReg != AArch64::SP && "Unaligned access to SP");
+    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, PLSized,
+                       AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI);
+  }
 }
 
 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
Index: lib/Target/AArch64/AArch64MachineFunctionInfo.h
===================================================================
--- lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -91,6 +91,15 @@
   /// other stack allocations.
   bool CalleeSaveStackHasFreeSpace = false;
 
+  /// The SVE stack size (for predicates and data vectors) is maintained here
+  /// rather than in FrameInfo, as the placement and Stack IDs are
+  /// target-specific.
+  uint64_t StackSizeSVE = 0;
+
+  /// The SVE region gets its own alignment, separate from the regular area
+  /// on the stack. This means we may align the SVE region separately.
+  unsigned MaxAlignSVE = 16;
+
   /// Has a value when it is known whether or not the function uses a
   /// redzone, and no value otherwise.
   /// Initialized during frame lowering, unless the function has the noredzone
@@ -120,6 +129,12 @@
     ArgumentStackToRestore = bytes;
   }
 
+  void setStackSizeSVE(uint64_t S) { StackSizeSVE = S; }
+  uint64_t getStackSizeSVE() const { return StackSizeSVE; }
+
+  void setMaxAlignSVE(unsigned A) { MaxAlignSVE = A; }
+  unsigned getMaxAlignSVE() const { return MaxAlignSVE; }
+
   bool hasStackFrame() const { return HasStackFrame; }
   void setHasStackFrame(bool s) { HasStackFrame = s; }
 
Index: lib/Target/AArch64/AArch64StackOffset.h
===================================================================
--- lib/Target/AArch64/AArch64StackOffset.h
+++ lib/Target/AArch64/AArch64StackOffset.h
@@ -33,28 +33,35 @@
 class StackOffset {
   int64_t Bytes;
+  int64_t ScalableBytes;
 
 public:
   using Part = std::pair<int64_t, MVT>;
 
-  StackOffset() : Bytes(0) {}
+  StackOffset() : Bytes(0), ScalableBytes(0) {}
 
   StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() {
-    assert(!MVT(T).isScalableVector() && "Scalable types not supported");
     *this += Part(Offset, T);
   }
 
-  StackOffset(const StackOffset &Other) : Bytes(Other.Bytes) {}
+  StackOffset(const StackOffset &Other)
+      : Bytes(Other.Bytes),
+        ScalableBytes(Other.ScalableBytes) {}
 
   StackOffset &operator+=(const StackOffset::Part &Other) {
     assert(Other.second.getSizeInBits() % 8 == 0 &&
           "Offset type is not a multiple of bytes");
-    Bytes += Other.first * (Other.second.getSizeInBits() / 8);
+    int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8);
+    if (Other.second.isScalableVector())
+      ScalableBytes += OffsetInBytes;
+    else
+      Bytes += OffsetInBytes;
     return *this;
   }
 
   StackOffset &operator+=(const StackOffset &Other) {
     Bytes += Other.Bytes;
+    ScalableBytes += Other.ScalableBytes;
     return *this;
   }
 
@@ -66,6 +73,7 @@
 
   StackOffset &operator-=(const StackOffset &Other) {
     Bytes -= Other.Bytes;
+    ScalableBytes -= Other.ScalableBytes;
     return *this;
   }
 
@@ -75,18 +83,38 @@
     return Res;
   }
 
+  /// Returns the scalable part of the offset in bytes.
+  int64_t getScalableBytes() const { return ScalableBytes; }
+
   /// Returns the non-scalable part of the offset in bytes.
   int64_t getBytes() const { return Bytes; }
 
-  /// Returns the offset in parts to which this frame offset can be
-  /// decomposed for the purpose of describing a frame offset.
-  /// For non-scalable offsets this is simple its byte size.
-  void getForFrameOffset(int64_t &ByteSized) const {
+  void getForFrameOffset(int64_t &ByteSized, int64_t &PLSized,
+                         int64_t &VLSized) const {
+    assert(isValid() && "Invalid frame offset");
+
     ByteSized = Bytes;
+    VLSized = 0;
+    PLSized = ScalableBytes / 2;
+    // This method is used to get the offsets to adjust the frame offset.
+    // If the function requires ADDPL to be used and needs more than two ADDPL
+    // instructions, part of the offset is folded into VLSized so that it uses
+    // ADDVL for part of it, reducing the number of ADDPL instructions.
+    if (PLSized % 8 == 0 || PLSized < -64 || PLSized > 62) {
+      VLSized = PLSized / 8;
+      PLSized -= VLSized * 8;
+    }
   }
 
   /// Returns whether the offset is known zero.
-  bool isZero() const { return !Bytes; }
+  bool isZero() const { return !Bytes && !ScalableBytes; }
+
+  bool isValid() const {
+    // The smallest scalable element supported by scaled SVE addressing
+    // modes is a predicate, which is 2 scalable bytes in size. So the
+    // scalable byte offset must always be a multiple of 2.
+    return ScalableBytes % 2 == 0;
+  }
 };
 
 } // end namespace llvm
Index: test/CodeGen/AArch64/framelayout-sve.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/framelayout-sve.mir
@@ -0,0 +1,53 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s
+#
+# Test allocation and deallocation of SVE objects on the stack,
+# as well as using a combination of scalable and non-scalable
+# offsets to access the SVE objects on the stack.
+#
+# SVE objects are allocated above all non-scalable objects
+# (callee saves, spill and stack objects), e.g.:
+#
+#       +-------------+
+#       | stack arg   |
+#       +-------------+ <- SP before call
+#       | SVE objs    |
+#       +-------------+
+#       | Frame record|   (if available)
+#       |-------------| <- FP (if available)
+#       | Callee Saves|
+#       |      :      |
+#       | Stack objs  |
+#       |      :      |
+#       +-------------+ <- SP after call and frame-setup
+#
+...
+# +----------+ <- SP before call
+# | %stack.2 |  // scalable SVE object of n * 18 bytes, aligned to 16 bytes,
+# |          |  // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes)
+# +----------+
+# | %stack.0 |  // not scalable
+# | %stack.1 |  // not scalable
+# +----------+ <- SP after call
+name: test_allocate_sve
+frameInfo:
+  maxAlignment: 16
+fixedStack:
+  - { id: 0, stack-id: 0,  size: 8,  alignment: 8, offset: -8 }
+  - { id: 1, stack-id: 0,  size: 8,  alignment: 8, offset: -16 }
+  - { id: 2, stack-id: 42, size: 8,  alignment: 8, offset: -24 }
+  - { id: 3, stack-id: 1,  size: 18, alignment: 2, offset: -18 }
+# CHECK:      name: test_allocate_sve
+# CHECK:      stackSize: 16
+
+# CHECK:      bb.0.entry:
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0
+
+# CHECK:      $sp = frame-destroy ADDXri $sp, 16, 0
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2
+# CHECK-NEXT: RET_ReallyLR
+body: |
+  bb.0.entry:
+    liveins: $z0, $z1
+    RET_ReallyLR
+---
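
Note for reviewers (not part of the patch): below is a minimal standalone sketch of the folding that StackOffset::getForFrameOffset performs on the scalable part of an offset, assuming the ADDPL immediate range of [-64, 62] used above and the fact that one vector-length (VL) step equals eight predicate-length (PL) steps. The helper name decomposeScalable and the example values are purely illustrative.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Mirrors the scalable-offset handling sketched in getForFrameOffset: the
// scalable byte offset is first expressed in predicate-sized (PL) units of
// 2 scalable bytes; if that count is a multiple of 8 or does not fit the
// ADDPL immediate range [-64, 62], whole vectors (1 VL = 8 PL) are peeled
// off so they can be materialized with ADDVL instead of a chain of ADDPLs.
static void decomposeScalable(int64_t ScalableBytes, int64_t &PLSized,
                              int64_t &VLSized) {
  assert(ScalableBytes % 2 == 0 && "offset must be a multiple of 2 scalable bytes");
  VLSized = 0;
  PLSized = ScalableBytes / 2;
  if (PLSized % 8 == 0 || PLSized < -64 || PLSized > 62) {
    VLSized = PLSized / 8;
    PLSized -= VLSized * 8;
  }
}

int main() {
  int64_t PL, VL;

  decomposeScalable(32, PL, VL);   // 2 full vectors, as in the test's epilogue
  std::printf("ADDVL #%lld, ADDPL #%lld\n", (long long)VL, (long long)PL); // 2, 0

  decomposeScalable(18, PL, VL);   // 9 predicates, fits a single ADDPL
  std::printf("ADDVL #%lld, ADDPL #%lld\n", (long long)VL, (long long)PL); // 0, 9

  decomposeScalable(-160, PL, VL); // -80 predicates, out of ADDPL range
  std::printf("ADDVL #%lld, ADDPL #%lld\n", (long long)VL, (long long)PL); // -10, 0
  return 0;
}

Compiling and running this prints the ADDVL/ADDPL counts that emitFrameOffset would hand to emitFrameOffsetAdj for each of the example offsets.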