diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2964,6 +2964,38 @@ return true; } +/// Returns true when SVE load/store instructions are used to access +/// fixed-length locals on the stack. +/// +/// This information can be used to determine whether an emergency spill-slot +/// may be required to cater for an explicit 'add' for materializing the SVE +/// load/store base-address when the address being loaded from cannot be +/// represented with reg + imm, when `imm` is not a multiple of VL-bytes. +bool useSVEAddressingModesForFixedLengthLocals(const MachineFunction &MF) { + const auto &STI = MF.getSubtarget<AArch64Subtarget>(); + if (!STI.useSVEForFixedLengthVectors()) + return false; + + TypeSize Scale = TypeSize::Fixed(1); + unsigned Width; + int64_t MinOffset, MaxOffset; + const AArch64InstrInfo *TII = + static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + + for (auto MBBI = MF.begin(), MBBE = MF.end(); MBBI != MBBE; ++MBBI) { + for (auto I = MBBI->begin(), E = MBBI->end(); I != E; ++I) { + unsigned Opc = I->getOpcode(); + if (!TII->getMemOpInfo(Opc, Scale, Width, MinOffset, MaxOffset) || + !Scale.isScalable()) + continue; + if (any_of(I->operands(), + [](const MachineOperand &Op) { return Op.isFI(); })) + return true; + } + } + return false; +} + void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { @@ -3083,9 +3115,10 @@ if (FixedOff > CalleeStackUsed) CalleeStackUsed = FixedOff; } - // Conservatively always assume BigStack when there are SVE spills. 
- bool BigStack = SVEStackSize || (EstimatedStackSize + CSStackSize + - CalleeStackUsed) > EstimatedStackSizeLimit; + bool BigStack = SVEStackSize || + (EstimatedStackSize + CSStackSize + CalleeStackUsed) > + EstimatedStackSizeLimit || + useSVEAddressingModesForFixedLengthLocals(MF); if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) AFI->setHasStackFrame(true); diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir --- a/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-scavengingslot.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog -mattr=+sve %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog -force-streaming-compatible-sve -mattr=+sve %s -o - | FileCheck %s --- # This test verifies that the emergency scavenging slot is located near the SP/BP. name: LateScavengingSlot @@ -26,3 +26,28 @@ liveins: $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $lr RET_ReallyLR implicit $x19, implicit $x20, implicit $x21, implicit $x22, implicit $x23, implicit $x24, implicit $x25, implicit $x26, implicit $x27, implicit $x28, implicit $lr ... 
+--- +# This test verifies that an emergency scavenging slot is allocated when using fixed-length SVE +name: RequiresScavengingSlotFixedLengthSVE +# CHECK-LABEL: name: RequiresScavengingSlotFixedLengthSVE +# CHECK: $sp = frame-setup SUBXri $sp, 32, 0 +# CHECK: frame-setup STPXi killed $fp, killed $lr, $sp, 2 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 16, 0 +# CHECK: STRXui killed $x8, $sp, 1 +# CHECK-NEXT: $x8 = ADDXri $fp, 16, 0 +# CHECK-NEXT: ST1W_IMM $z0, $p0, killed $x8, 0 +# CHECK-NEXT: $x8 = LDRXui $sp, 1 +tracksRegLiveness: true +frameInfo: + isFrameAddressTaken: true +fixedStack: + - { id: 0, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0: + liveins: $z0, $p0 + ST1W_IMM $z0, $p0, %fixed-stack.0, 0 + B %bb.1 + bb.1: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16, $x17, $x18, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $lr + RET_ReallyLR implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18, implicit $x20, implicit $x21, implicit $x22, implicit $x23, implicit $x24, implicit $x25, implicit $x26, implicit $x27, implicit $x28, implicit $lr +... 
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-ld2-alloca.ll @@ -8,16 +8,18 @@ define void @st1d_fixed(ptr %ptr) #0 { ; CHECK-LABEL: st1d_fixed: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #144 -; CHECK-NEXT: stp x30, x19, [sp, #128] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #160 +; CHECK-NEXT: stp x30, x19, [sp, #144] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: str x29, [sp, #128] // 8-byte Folded Spill ; CHECK-NEXT: bl def ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [sp] +; CHECK-NEXT: ldr x29, [sp, #128] // 8-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #128] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #144 +; CHECK-NEXT: ldp x30, x19, [sp, #144] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #160 ; CHECK-NEXT: ret %alloc = alloca [16 x double] call void @def(ptr %alloc) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -8,23 +8,25 @@ define void @alloc_v4i8(ptr %st_ptr) #0 { ; CHECK-LABEL: alloc_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x0, sp, #28 +; CHECK-NEXT: str x29, [sp, #16] // 8-byte Folded Spill ; CHECK-NEXT: bl def -; CHECK-NEXT: add x8, sp, #12 +; CHECK-NEXT: add x8, sp, #28 ; CHECK-NEXT: ptrue p0.b, vl2 ; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, 
[x8] ; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldr x29, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: mov z2.b, z0.b[1] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: stp w8, w9, [sp] -; CHECK-NEXT: ldr d0, [sp] +; CHECK-NEXT: stp w8, w9, [sp, #8] +; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: st1b { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret %alloc = alloca [4 x i8] call void @def(ptr %alloc) @@ -41,6 +43,7 @@ ; CHECK-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: str x29, [sp, #16] // 8-byte Folded Spill ; CHECK-NEXT: bl def ; CHECK-NEXT: add x8, sp, #24 ; CHECK-NEXT: ptrue p0.b, vl3 @@ -52,16 +55,17 @@ ; CHECK-NEXT: mov z0.b, z1.b[1] ; CHECK-NEXT: fmov w9, s2 ; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: strh w8, [sp, #8] +; CHECK-NEXT: strh w8, [sp] ; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strh w9, [sp, #14] -; CHECK-NEXT: strh w10, [sp, #12] -; CHECK-NEXT: strh w8, [sp, #10] -; CHECK-NEXT: add x8, sp, #20 -; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: strh w9, [sp, #6] +; CHECK-NEXT: strh w10, [sp, #4] +; CHECK-NEXT: strh w8, [sp, #2] +; CHECK-NEXT: add x8, sp, #12 +; CHECK-NEXT: ldr d0, [sp] ; CHECK-NEXT: st1b { z0.h }, p0, [x8] -; CHECK-NEXT: ldrh w8, [sp, #20] +; CHECK-NEXT: ldrh w8, [sp, #12] ; CHECK-NEXT: strb w10, [x19, #2] +; CHECK-NEXT: ldr x29, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: strh w8, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #48 @@ -130,14 +134,14 @@ ; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: mov x0, sp -; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: mov x20, sp ; 
CHECK-NEXT: bl def ; CHECK-NEXT: mov x8, #4 ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [sp] ; CHECK-NEXT: ld2d { z2.d, z3.d }, p0/z, [x20, x8, lsl #3] -; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: stp q0, q2, [x19] ; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96