Index: lib/Target/AArch64/AArch64FrameLowering.h
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.h
+++ lib/Target/AArch64/AArch64FrameLowering.h
@@ -20,6 +20,11 @@
 class AArch64FrameLowering : public TargetFrameLowering {
 public:
+  enum StackID {
+    ST_Default = 0,
+    ST_SVEVector = 1
+  };
+
   explicit AArch64FrameLowering()
       : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
                             true /*StackRealignable*/) {}
Index: lib/Target/AArch64/AArch64FrameLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.cpp
+++ lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -184,6 +184,11 @@
   return DefaultSafeSPDisplacement;
 }
 
+static uint64_t getSVEStackSize(const MachineFunction &MF) {
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  return AFI->getStackSizeSVE();
+}
+
 bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
   if (!EnableRedZone)
     return false;
@@ -196,7 +201,8 @@
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   unsigned NumBytes = AFI->getLocalStackSize();
 
-  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128);
+  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
+           getSVEStackSize(MF));
 }
 
 /// hasFP - Return true if the specified function should have a dedicated frame
@@ -839,6 +845,13 @@
   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
     return;
 
+  // Allocate the SVE area.
+  int64_t SVEStackSize = getSVEStackSize(MF);
+  if (SVEStackSize)
+    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                    {-SVEStackSize, MVT::nxv1i8}, TII,
+                    MachineInstr::FrameSetup);
+
   // getStackSize() includes all the locals in its size calculation. We don't
   // include these locals when computing the stack size of a funclet, as they
   // are allocated in the parent's stack frame and accessed via the frame
@@ -849,6 +862,7 @@
                            : (int)MFI.getStackSize();
   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
     assert(!HasFP && "unexpected function without stack frame but with FP");
+    assert(!SVEStackSize && "Must have stack frame with SVE");
     // All of the stack allocation is for locals.
     AFI->setLocalStackSize(NumBytes);
     if (!NumBytes)
@@ -1264,6 +1278,8 @@
                            : MFI.getStackSize();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
 
+  MachineBasicBlock::iterator FirstTerminator = MBB.getFirstTerminator();
+
   // All calls are tail calls in GHC calling conv, and functions have no
   // prologue/epilogue.
   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
     return;
@@ -1332,10 +1348,12 @@
   if (MF.hasEHFunclets())
     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+  int64_t SVEStackSize = getSVEStackSize(MF);
+
   // Assume we can't combine the last pop with the sp restore.
 
   if (!CombineSPBump && PrologueSaveSize != 0) {
-    MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
+    MachineBasicBlock::iterator Pop = std::prev(FirstTerminator);
     while (AArch64InstrInfo::isSEHInstruction(*Pop))
       Pop = std::prev(Pop);
     // Converting the last ldp to a post-index ldp is valid only if the last
@@ -1357,7 +1375,7 @@
   // Move past the restores of the callee-saved registers.
   // If we plan on combining the sp bump of the local stack size and the callee
   // save stack size, we might need to adjust the CSR save and restore offsets.
-  MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
+  MachineBasicBlock::iterator LastPopI = FirstTerminator;
   MachineBasicBlock::iterator Begin = MBB.begin();
   while (LastPopI != Begin) {
     --LastPopI;
@@ -1375,13 +1393,17 @@
 
   // If there is a single SP update, insert it before the ret and we're done.
   if (CombineSPBump) {
-    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
+    emitFrameOffset(MBB, FirstTerminator, DL, AArch64::SP, AArch64::SP,
                     {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
     if (NeedsWinCFI)
-      BuildMI(MBB, MBB.getFirstTerminator(), DL,
+      BuildMI(MBB, FirstTerminator, DL,
              TII->get(AArch64::SEH_EpilogEnd))
          .setMIFlag(MachineInstr::FrameDestroy);
+
+    emitFrameOffset(MBB, FirstTerminator, DL, AArch64::SP, AArch64::SP,
+                    {SVEStackSize, MVT::nxv1i8}, TII,
+                    MachineInstr::FrameDestroy, false, false);
     return;
   }
@@ -1406,14 +1428,14 @@
 
     // If we're done after this, make sure to help the load store optimizer.
     if (Done)
-      adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
+      adaptForLdStOpt(MBB, FirstTerminator, LastPopI);
 
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                     {StackRestoreBytes, MVT::i8}, TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
-    if (Done) {
+    if (!SVEStackSize && Done) {
       if (NeedsWinCFI)
-        BuildMI(MBB, MBB.getFirstTerminator(), DL,
+        BuildMI(MBB, FirstTerminator, DL,
                 TII->get(AArch64::SEH_EpilogEnd))
             .setMIFlag(MachineInstr::FrameDestroy);
       return;
     }
@@ -1442,7 +1464,7 @@
     // Find an insertion point for the first ldp so that it goes before the
     // shadow call stack epilog instruction. This ensures that the restore of
     // lr from x18 is placed after the restore from sp.
-    auto FirstSPPopI = MBB.getFirstTerminator();
+    auto FirstSPPopI = FirstTerminator;
     while (FirstSPPopI != Begin) {
       auto Prev = std::prev(FirstSPPopI);
       if (Prev->getOpcode() != AArch64::LDRXpre ||
@@ -1458,8 +1480,12 @@
                     MachineInstr::FrameDestroy, false, NeedsWinCFI);
   }
   if (NeedsWinCFI)
-    BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
+    BuildMI(MBB, FirstTerminator, DL, TII->get(AArch64::SEH_EpilogEnd))
        .setMIFlag(MachineInstr::FrameDestroy);
+
+  emitFrameOffset(MBB, FirstTerminator, DL, AArch64::SP, AArch64::SP,
+                  {SVEStackSize, MVT::nxv1i8}, TII,
+                  MachineInstr::FrameDestroy, false, false);
 }
 
 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -1517,6 +1543,9 @@
   bool isCSR = !isFixed &&
                MFI.getObjectOffset(FI) >= -((int)AFI->getCalleeSavedStackSize());
 
+  uint64_t SVEStackSize = getSVEStackSize(MF);
+  assert(!SVEStackSize && "Accessing SVE frame indices not yet supported");
+
   // Use frame pointer to reference fixed objects. Use it for locals if
   // there are VLAs or a dynamically realigned SP (and thus the SP isn't
   // reliable as a base). Make sure useFPForScavengingIndex() does the
@@ -2075,8 +2104,19 @@
                  << ' ' << printReg(Reg, RegInfo);
              dbgs() << "\n";);
 
+  bool HasSVEStackObjects = [&MFI]() {
+    for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
+      if (MFI.getStackID(I) == ST_SVEVector &&
+          MFI.getObjectOffset(I) < 0)
+        return true;
+    // Note: We don't take allocatable stack objects into
+    // account yet, because allocation for those is not yet
+    // implemented.
+    return false;
+  }();
+
   // If any callee-saved registers are used, the frame cannot be eliminated.
-  bool CanEliminateFrame = SavedRegs.count() == 0;
+  bool CanEliminateFrame = (SavedRegs.count() == 0) && !HasSVEStackObjects;
 
   // The CSR spill slots have not been allocated yet, so estimateStackSize
   // won't include them.
@@ -2139,12 +2179,34 @@
 
 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
+         "Upwards growing stack unsupported");
+
+  // Process all fixed stack SVE objects.
+  int64_t Offset = 0;
+  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) {
+    unsigned StackID = MFI.getStackID(I);
+    if (StackID == ST_SVEVector) {
+      int64_t FixedOffset = -MFI.getObjectOffset(I);
+      if (FixedOffset > Offset)
+        Offset = FixedOffset;
+    }
+  }
+
+  unsigned MaxAlign = getStackAlignment();
+  uint64_t SVEStackSize = alignTo(Offset, MaxAlign);
+
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  AFI->setStackSizeSVE(SVEStackSize);
+  AFI->setMaxAlignSVE(MaxAlign);
+
   // If this function isn't doing Win64-style C++ EH, we don't need to do
   // anything.
   if (!MF.hasEHFunclets())
     return;
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  MachineFrameInfo &MFI = MF.getFrameInfo();
   WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
 
   MachineBasicBlock &MBB = MF.front();
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2974,6 +2974,16 @@
     MaxEncoding = 0xfff;
     ShiftSize = 12;
     break;
+  case AArch64::ADDVL_XXI:
+  case AArch64::ADDPL_XXI:
+    MaxEncoding = 31;
+    ShiftSize = 0;
+    if (Offset < 0) {
+      MaxEncoding = 32;
+      Sign = -1;
+      Offset = -Offset;
+    }
+    break;
   default:
     llvm_unreachable("Unsupported opcode");
   }
@@ -3039,8 +3049,8 @@
                            StackOffset Offset, const TargetInstrInfo *TII,
                            MachineInstr::MIFlag Flag, bool SetNZCV,
                            bool NeedsWinCFI) {
-  int64_t Bytes;
-  Offset.getForFrameOffset(Bytes);
+  int64_t Bytes, PLSized, VLSized;
+  Offset.getForFrameOffset(Bytes, PLSized, VLSized);
 
   // First emit non-scalable frame offsets, or a simple 'mov'.
   if (Bytes || (Offset.isZero() && SrcReg != DestReg)) {
@@ -3055,6 +3065,23 @@
                        Flag, NeedsWinCFI);
     SrcReg = DestReg;
   }
+
+  assert(!(SetNZCV && (PLSized || VLSized)) &&
+         "SetNZCV not supported with SVE vectors");
+  assert(!(NeedsWinCFI && (PLSized || VLSized)) &&
+         "WinCFI not supported with SVE vectors");
+
+  if (VLSized) {
+    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, VLSized,
+                       AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI);
+    SrcReg = DestReg;
+  }
+
+  if (PLSized) {
+    assert(DestReg != AArch64::SP && "Unaligned access to SP");
+    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, PLSized,
+                       AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI);
+  }
 }
 
 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
Index: lib/Target/AArch64/AArch64MachineFunctionInfo.h
===================================================================
--- lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -91,6 +91,15 @@
   /// other stack allocations.
   bool CalleeSaveStackHasFreeSpace = false;
 
+  /// The SVE stack size (for predicates and data vectors) is maintained here
+  /// rather than in FrameInfo, as the placement and Stack IDs are target
+  /// specific.
+  uint64_t StackSizeSVE = 0;
+
+  /// The SVE region gets its own alignment, separate from the regular area
+  /// on the stack. This means we may align the SVE region separately.
+  unsigned MaxAlignSVE = 16;
+
   /// Has a value when it is known whether or not the function uses a
   /// redzone, and no value otherwise.
   /// Initialized during frame lowering, unless the function has the noredzone
@@ -120,6 +129,12 @@
     ArgumentStackToRestore = bytes;
   }
 
+  void setStackSizeSVE(uint64_t S) { StackSizeSVE = S; }
+  uint64_t getStackSizeSVE() const { return StackSizeSVE; }
+
+  void setMaxAlignSVE(unsigned A) { MaxAlignSVE = A; }
+  unsigned getMaxAlignSVE() const { return MaxAlignSVE; }
+
   bool hasStackFrame() const { return HasStackFrame; }
   void setHasStackFrame(bool s) { HasStackFrame = s; }
Index: lib/Target/AArch64/AArch64StackOffset.h
===================================================================
--- lib/Target/AArch64/AArch64StackOffset.h
+++ lib/Target/AArch64/AArch64StackOffset.h
@@ -35,28 +35,35 @@
 /// vector and a 64bit GPR.
 class StackOffset {
   int64_t Bytes;
+  int64_t ScalableBytes;
 
 public:
   using Part = std::pair<int64_t, MVT>;
 
-  StackOffset() : Bytes(0) {}
+  StackOffset() : Bytes(0), ScalableBytes(0) {}
 
   StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() {
-    assert(!MVT(T).isScalableVector() && "Scalable types not supported");
     *this += Part(Offset, T);
   }
 
-  StackOffset(const StackOffset &Other) : Bytes(Other.Bytes) {}
+  StackOffset(const StackOffset &Other)
+      : Bytes(Other.Bytes),
+        ScalableBytes(Other.ScalableBytes) {}
 
   StackOffset &operator+=(const StackOffset::Part &Other) {
     assert(Other.second.getSizeInBits() % 8 == 0 &&
            "Offset type is not a multiple of bytes");
-    Bytes += Other.first * (Other.second.getSizeInBits() / 8);
+    int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8);
+    if (Other.second.isScalableVector())
+      ScalableBytes += OffsetInBytes;
+    else
+      Bytes += OffsetInBytes;
     return *this;
   }
 
   StackOffset &operator+=(const StackOffset &Other) {
     Bytes += Other.Bytes;
+    ScalableBytes += Other.ScalableBytes;
     return *this;
   }
 
@@ -68,6 +75,7 @@
 
   StackOffset &operator-=(const StackOffset &Other) {
     Bytes -= Other.Bytes;
+    ScalableBytes -= Other.ScalableBytes;
     return *this;
   }
 
@@ -77,18 +85,41 @@
     return Res;
   }
 
+  /// Returns the scalable part of the offset in bytes.
+  int64_t getScalableBytes() const { return ScalableBytes; }
+
   /// Returns the non-scalable part of the offset in bytes.
   int64_t getBytes() const { return Bytes; }
 
   /// Returns the offset in parts to which this frame offset can be
   /// decomposed for the purpose of describing a frame offset.
   /// For non-scalable offsets this is simply its byte size.
-  void getForFrameOffset(int64_t &ByteSized) const {
+  void getForFrameOffset(int64_t &ByteSized, int64_t &PLSized,
+                         int64_t &VLSized) const {
+    assert(isValid() && "Invalid frame offset");
+
     ByteSized = Bytes;
+    VLSized = 0;
+    PLSized = ScalableBytes / 2;
+    // This method is used to get the offsets to adjust the frame offset.
+    // If the offset requires ADDPL to be used and needs more than two ADDPL
+    // instructions, part of the offset is folded into VLSized so that it uses
+    // ADDVL for part of it, reducing the number of ADDPL instructions.
+    if (PLSized % 8 == 0 || PLSized < -64 || PLSized > 62) {
+      VLSized = PLSized / 8;
+      PLSized -= VLSized * 8;
+    }
   }
 
   /// Returns whether the offset is known zero.
-  bool isZero() const { return !Bytes; }
+  bool isZero() const { return !Bytes && !ScalableBytes; }
+
+  bool isValid() const {
+    // The smallest scalable elements supported by scaled SVE addressing modes
+    // are predicates, which are 2 scalable bytes in size. So the scalable
+    // byte offset must always be a multiple of 2.
+    return ScalableBytes % 2 == 0;
+  }
 };
 
 } // end namespace llvm
Index: test/CodeGen/AArch64/framelayout-sve.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/framelayout-sve.mir
@@ -0,0 +1,53 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s
+#
+# Test allocation and deallocation of SVE objects on the stack,
+# as well as using a combination of scalable and non-scalable
+# offsets to access the SVE objects on the stack.
+#
+# SVE objects are allocated above all non-scalable objects
+# (callee saves, spills and stack objects), e.g.:
+#
+#     +-------------+
+#     |  stack arg  |
+#     +-------------+ <- SP before call
+#     |  SVE objs   |
+#     +-------------+
+#     | Frame record|  (if available)
+#     |-------------| <- FP (if available)
+#     | Callee Saves|
+#     |      :      |
+#     |  Stack objs |
+#     |      :      |
+#     +-------------+ <- SP after call and frame-setup
+#
+...
+# +----------+ <- SP before call
+# | %stack.2 |  // scalable SVE object of n * 18 bytes, aligned to 16 bytes,
+# |          |  // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes)
+# +----------+
+# | %stack.0 |  // not scalable
+# | %stack.1 |  // not scalable
+# +----------+ <- SP after call
+name:            test_allocate_sve
+frameInfo:
+  maxAlignment:  16
+fixedStack:
+  - { id: 0, stack-id: 0, size: 8, alignment: 8, offset: -8 }
+  - { id: 1, stack-id: 0, size: 8, alignment: 8, offset: -16 }
+  - { id: 2, stack-id: 42, size: 8, alignment: 8, offset: -24 }
+  - { id: 3, stack-id: 1, size: 18, alignment: 2, offset: -18 }
+# CHECK:      name: test_allocate_sve
+# CHECK:      stackSize: 16
+
+# CHECK:      bb.0.entry:
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0
+
+# CHECK:      $sp = frame-destroy ADDXri $sp, 16, 0
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2
+# CHECK-NEXT: RET_ReallyLR
+body: |
+  bb.0.entry:
+    liveins: $z0, $z1
+    RET_ReallyLR
+---
Index: unittests/Target/AArch64/TestStackOffset.cpp
===================================================================
--- unittests/Target/AArch64/TestStackOffset.cpp
+++ unittests/Target/AArch64/TestStackOffset.cpp
@@ -20,6 +20,15 @@
 
   StackOffset C(2, MVT::v4i64);
   EXPECT_EQ(64, C.getBytes());
+
+  StackOffset D(2, MVT::nxv4i64);
+  EXPECT_EQ(64, D.getScalableBytes());
+
+  StackOffset E(2, MVT::v4i64);
+  EXPECT_EQ(0, E.getScalableBytes());
+
+  StackOffset F(2, MVT::nxv4i64);
+  EXPECT_EQ(0, F.getBytes());
 }
 
 TEST(StackOffset, Add) {
@@ -31,6 +40,11 @@
   StackOffset D(1, MVT::i32);
   D += A;
   EXPECT_EQ(12, D.getBytes());
+
+  StackOffset E(1, MVT::nxv1i32);
+  StackOffset F = C + E;
+  EXPECT_EQ(12, F.getBytes());
+  EXPECT_EQ(4, F.getScalableBytes());
 }
 
 TEST(StackOffset, Sub) {
@@ -42,6 +56,12 @@
   StackOffset D(1, MVT::i64);
   D -= A;
   EXPECT_EQ(0, D.getBytes());
+
+  C += StackOffset(2, MVT::nxv1i32);
+  StackOffset E = StackOffset(1, MVT::nxv1i32);
+  StackOffset F = C - E;
+  EXPECT_EQ(4, F.getBytes());
+  EXPECT_EQ(4, F.getScalableBytes());
 }
 
 TEST(StackOffset, isZero) {
@@ -49,12 +69,57 @@
   StackOffset B(0, MVT::i32);
   EXPECT_TRUE(A.isZero());
   EXPECT_TRUE((A+B).isZero());
+
+  StackOffset C(0, MVT::nxv1i32);
+  EXPECT_TRUE((A+C).isZero());
+
+  StackOffset D(1, MVT::nxv1i32);
+  EXPECT_FALSE((A+D).isZero());
+}
+
+TEST(StackOffset, isValid) {
+  EXPECT_FALSE(StackOffset(1, MVT::nxv8i1).isValid());
+  EXPECT_TRUE(StackOffset(2, MVT::nxv8i1).isValid());
+
+  EXPECT_DEATH(StackOffset(1, MVT::i1),
+               "Offset type is not a multiple of bytes");
+  EXPECT_DEATH(StackOffset(1, MVT::nxv1i1),
+               "Offset type is not a multiple of bytes");
 }
 
 TEST(StackOffset, getForFrameOffset) {
   StackOffset A(1, MVT::i64);
   StackOffset B(1, MVT::i32);
-  int64_t ByteSized;
-  (A+B).getForFrameOffset(ByteSized);
+  StackOffset C(1, MVT::nxv4i32);
+
+  // If all offsets can be materialized with only ADDVL,
+  // make sure PLSized is 0.
+  int64_t ByteSized, VLSized, PLSized;
+  (A+B+C).getForFrameOffset(ByteSized, PLSized, VLSized);
   EXPECT_EQ(12, ByteSized);
+  EXPECT_EQ(1, VLSized);
+  EXPECT_EQ(0, PLSized);
+
+  // If we need an ADDPL to materialize the offset, and the number of scalable
+  // bytes fits the ADDPL immediate, fold the scalable bytes to fit in PLSized.
+  StackOffset D(1, MVT::nxv16i1);
+  (C+D).getForFrameOffset(ByteSized, PLSized, VLSized);
+  EXPECT_EQ(0, ByteSized);
+  EXPECT_EQ(0, VLSized);
+  EXPECT_EQ(9, PLSized);
+
+  StackOffset E(4, MVT::nxv4i32);
+  StackOffset F(1, MVT::nxv16i1);
+  (E+F).getForFrameOffset(ByteSized, PLSized, VLSized);
+  EXPECT_EQ(0, ByteSized);
+  EXPECT_EQ(0, VLSized);
+  EXPECT_EQ(33, PLSized);
+
+  // If the offset requires an ADDPL instruction to materialize, and would
+  // require more than two instructions, decompose it into both
+  // ADDVL (n x 16 bytes) and ADDPL (n x 2 bytes) instructions.
+  StackOffset G(8, MVT::nxv4i32);
+  StackOffset H(1, MVT::nxv16i1);
+  (G+H).getForFrameOffset(ByteSized, PLSized, VLSized);
+  EXPECT_EQ(0, ByteSized);
+  EXPECT_EQ(8, VLSized);
+  EXPECT_EQ(1, PLSized);
 }
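
Note (reviewer addition, not part of the patch): the ADDVL/ADDPL split that StackOffset::getForFrameOffset performs, and that the unit tests above exercise, can be reproduced with a small standalone C++ sketch. The helper decomposeScalableOffset and the main() driver below are hypothetical names used only for illustration; the constants mirror the patch: ADDPL adjusts a register in units of 2 scalable bytes (one predicate), ADDVL in units of 16 scalable bytes (one vector), and the predicate count is folded into whole vectors when it is a multiple of 8 or falls outside the [-64, 62] window used in getForFrameOffset.

  // Standalone sketch (hypothetical, not LLVM code) of the scalable-offset
  // decomposition implemented by StackOffset::getForFrameOffset.
  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  // Splits a scalable byte offset into ADDVL units (16 scalable bytes) and
  // ADDPL units (2 scalable bytes).
  static void decomposeScalableOffset(int64_t ScalableBytes, int64_t &VLSized,
                                      int64_t &PLSized) {
    assert(ScalableBytes % 2 == 0 && "predicate granules are 2 scalable bytes");
    VLSized = 0;
    PLSized = ScalableBytes / 2;
    // Same folding rule as the patch: prefer ADDVL when the predicate count is
    // a whole number of vectors or does not fit the ADDPL-based range.
    if (PLSized % 8 == 0 || PLSized < -64 || PLSized > 62) {
      VLSized = PLSized / 8; // 8 predicate granules per vector.
      PLSized -= VLSized * 8;
    }
  }

  int main() {
    // 8 x nxv4i32 + 1 x nxv16i1 = 130 scalable bytes, as in the last unit test
    // above: expect an ADDVL by 8 and an ADDPL by 1.
    int64_t VL, PL;
    decomposeScalableOffset(130, VL, PL);
    std::printf("ADDVL %lld, ADDPL %lld\n", (long long)VL, (long long)PL);
    return 0;
  }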