Index: lib/Target/AArch64/AArch64CallingConvention.td =================================================================== --- lib/Target/AArch64/AArch64CallingConvention.td +++ lib/Target/AArch64/AArch64CallingConvention.td @@ -288,6 +288,12 @@ D8, D9, D10, D11, D12, D13, D14, D15)>; +// AArch64 PCS for vector functions (VPCS) +// must (additionally) preserve full Q8-Q23 registers +def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, + X23, X24, X25, X26, X27, X28, + (sequence "Q%u", 8, 23))>; + // Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since // 'this' and the pointer return value are both passed in X0 in these cases, // this can be partially modelled by treating X0 as a callee-saved register; @@ -362,5 +368,7 @@ : CalleeSavedRegs<(add CSR_AArch64_AAPCS_SwiftError, X18)>; def CSR_AArch64_RT_MostRegs_SCS : CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs, X18)>; +def CSR_AArch64_AAVPCS_SCS + : CalleeSavedRegs<(add CSR_AArch64_AAVPCS, X18)>; def CSR_AArch64_AAPCS_SCS : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X18)>; Index: lib/Target/AArch64/AArch64FrameLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64FrameLowering.cpp +++ lib/Target/AArch64/AArch64FrameLowering.cpp @@ -435,12 +435,19 @@ NewOpc = AArch64::STPDpre; Scale = 8; break; + case AArch64::STPQi: + NewOpc = AArch64::STPQpre; + Scale = 16; + break; case AArch64::STRXui: NewOpc = AArch64::STRXpre; break; case AArch64::STRDui: NewOpc = AArch64::STRDpre; break; + case AArch64::STRQui: + NewOpc = AArch64::STRQpre; + break; case AArch64::LDPXi: NewOpc = AArch64::LDPXpost; Scale = 8; @@ -449,12 +456,19 @@ NewOpc = AArch64::LDPDpost; Scale = 8; break; + case AArch64::LDPQi: + NewOpc = AArch64::LDPQpost; + Scale = 16; + break; case AArch64::LDRXui: NewOpc = AArch64::LDRXpost; break; case AArch64::LDRDui: NewOpc = AArch64::LDRDpost; break; + case AArch64::LDRQui: + NewOpc = AArch64::LDRQpost; + break; } MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); @@ -506,6 +520,12 @@ case AArch64::LDRDui: Scale = 8; break; + case AArch64::STPQi: + case AArch64::STRQui: + case AArch64::LDPQi: + case AArch64::LDRQui: + Scale = 16; + break; default: llvm_unreachable("Unexpected callee-save save/restore opcode!"); } @@ -516,7 +536,7 @@ // Last operand is immediate offset that needs fixing. MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx); // All generated opcodes have scaled offsets. - assert(LocalStackSize % 8 == 0); + assert(LocalStackSize % 16 == 0); OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale); } @@ -1149,7 +1169,7 @@ unsigned Reg2 = AArch64::NoRegister; int FrameIdx; int Offset; - enum RegType { GPR, FPR64 } Type; + enum RegType { GPR, FPR64, FPR128 } Type; RegPairInfo() = default; @@ -1187,6 +1207,8 @@ RPI.Type = RegPairInfo::GPR; else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) RPI.Type = RegPairInfo::FPR64; + else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) + RPI.Type = RegPairInfo::FPR128; else llvm_unreachable("Unsupported register class."); @@ -1202,6 +1224,10 @@ if (AArch64::FPR64RegClass.contains(NextReg)) RPI.Reg2 = NextReg; break; + case RegPairInfo::FPR128: + if (AArch64::FPR128RegClass.contains(NextReg)) + RPI.Reg2 = NextReg; + break; } } @@ -1235,17 +1261,21 @@ RPI.FrameIdx = CSI[i].getFrameIdx(); - if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) { - // Round up size of non-pair to pair size if we need to pad the - // callee-save area to ensure 16-byte alignment. 
- Offset -= 16; + int Scale = RPI.Type == RegPairInfo::FPR128 ? 16 : 8; + Offset -= RPI.isPaired() ? 2 * Scale : Scale; + + // Round up size of non-pair to pair size if we need to pad the + // callee-save area to ensure 16-byte alignment. + if (AFI->hasCalleeSaveStackFreeSpace() && + RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) { + Offset -= 8; + assert(Offset % 16 == 0); assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16); MFI.setObjectAlignment(RPI.FrameIdx, 16); - AFI->setCalleeSaveStackHasFreeSpace(true); - } else - Offset -= RPI.isPaired() ? 16 : 8; - assert(Offset % 8 == 0); - RPI.Offset = Offset / 8; + } + + assert(Offset % Scale == 0); + RPI.Offset = Offset / Scale; assert((RPI.Offset >= -64 && RPI.Offset <= 63) && "Offset out of bounds for LDP/STP immediate"); @@ -1311,6 +1341,11 @@ Size = RPI.isPaired() ? 16 : 8; Align = 8; break; + case RegPairInfo::FPR128: + StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; + Size = RPI.isPaired() ? 32 : 16; + Align = 16; + break; } LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); @@ -1382,6 +1417,11 @@ Size = RPI.isPaired() ? 16 : 8; Align = 8; break; + case RegPairInfo::FPR128: + LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui; + Size = RPI.isPaired() ? 32 : 16; + Align = 16; + break; } LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); @@ -1448,24 +1488,6 @@ ? RegInfo->getBaseRegister() : (unsigned)AArch64::NoRegister; - unsigned SpillEstimate = SavedRegs.count(); - for (unsigned i = 0; CSRegs[i]; ++i) { - unsigned Reg = CSRegs[i]; - unsigned PairedReg = CSRegs[i ^ 1]; - if (Reg == BasePointerReg) - SpillEstimate++; - if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) - SpillEstimate++; - } - SpillEstimate += 2; // Conservatively include FP+LR in the estimate - unsigned StackEstimate = MFI.estimateStackSize(MF) + 8 * SpillEstimate; - - // The frame record needs to be created by saving the appropriate registers - if (hasFP(MF) || windowsRequiresStackProbe(MF, StackEstimate)) { - SavedRegs.set(AArch64::FP); - SavedRegs.set(AArch64::LR); - } - unsigned ExtraCSSpill = 0; // Figure out which callee-saved registers to save/restore. for (unsigned i = 0; CSRegs[i]; ++i) { @@ -1497,6 +1519,26 @@ } } + // Calculates the callee saved stack size. + auto getCSStackSize = [&CSRegs, &SavedRegs]() { + unsigned Size = 0; + for (unsigned i = 0; CSRegs[i]; ++i) + if (SavedRegs.test(CSRegs[i])) + Size += (AArch64::FPR64RegClass.contains(CSRegs[i]) || + AArch64::GPR64RegClass.contains(CSRegs[i])) + ? 8 : 16; + return Size; + }; + + // The frame record needs to be created by saving the appropriate registers + unsigned EstimatedStackSize = MFI.estimateStackSize(MF) + getCSStackSize(); + if (hasFP(MF) || + windowsRequiresStackProbe(MF, EstimatedStackSize + 16)) { + SavedRegs.set(AArch64::FP); + SavedRegs.set(AArch64::LR); + EstimatedStackSize += 16; + } + LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; for (unsigned Reg : SavedRegs.set_bits()) dbgs() @@ -1504,15 +1546,12 @@ dbgs() << "\n";); // If any callee-saved registers are used, the frame cannot be eliminated. - unsigned NumRegsSpilled = SavedRegs.count(); - bool CanEliminateFrame = NumRegsSpilled == 0; + bool CanEliminateFrame = SavedRegs.count() == 0; // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. 
-  unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
-  LLVM_DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
   unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
-  bool BigStack = (CFSize > EstimatedStackSizeLimit);
+  bool BigStack = EstimatedStackSize > EstimatedStackSizeLimit;
   if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
     AFI->setHasStackFrame(true);
 
@@ -1533,7 +1572,6 @@
       if (produceCompactUnwindFrame(MF))
         SavedRegs.set(UnspilledCSGPRPaired);
       ExtraCSSpill = UnspilledCSGPRPaired;
-      NumRegsSpilled = SavedRegs.count();
     }
 
     // If we didn't find an extra callee-saved register to spill, create
@@ -1550,9 +1588,17 @@
     }
   }
 
+  // Recalculate the size of the CSRs
+  unsigned CSStackSize = getCSStackSize();
+  unsigned AlignedCSStackSize = alignTo(CSStackSize, 16);
+  LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
+                    << MFI.estimateStackSize(MF) + AlignedCSStackSize
+                    << " bytes.\n");
+
   // Round up to register pair alignment to avoid additional SP adjustment
   // instructions.
-  AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16));
+  AFI->setCalleeSavedStackSize(AlignedCSStackSize);
+  AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
 }
 
 bool AArch64FrameLowering::enableStackSlotScavenging(
Index: lib/Target/AArch64/AArch64RegisterInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -49,8 +48,7 @@
   if (MF->getFunction().getCallingConv() == CallingConv::AnyReg)
     return CSR_AArch64_AllRegs_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
-    // FIXME: default to AAPCS until we add full support.
-    return CSR_AArch64_AAPCS_SaveList;
+    return CSR_AArch64_AAVPCS_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
     return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
            CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
@@ -101,8 +100,7 @@
     return SCS ? CSR_AArch64_CXX_TLS_Darwin_SCS_RegMask
                : CSR_AArch64_CXX_TLS_Darwin_RegMask;
   if (CC == CallingConv::AArch64_VectorCall)
-    // FIXME: default to AAPCS until we add full support.
-    return SCS ? CSR_AArch64_AAPCS_SCS_RegMask : CSR_AArch64_AAPCS_RegMask;
+    return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask;
   if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
           ->supportSwiftError() &&
       MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
Index: test/CodeGen/AArch64/aarch64-vector-pcs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/aarch64-vector-pcs.ll
@@ -0,0 +1,139 @@
+; The tests below check the allocation of 128-bit callee-saves
+; on the stack, specifically their offsets.
+
+; Padding of GPR64 registers is needed to ensure 16-byte alignment of
+; the stack pointer after the GPR64/FPR64 block (which is also needed
+; for the FPR128 saves when present).
+
+; This file also tests whether an emergency stack slot is allocated
+; when the stack frame exceeds a given size, caused by a series of
+; FPR128 saves. The alignment can leave a gap that can be reused
+; for stack slot scavenging, so it is important that the stack size
+; is properly estimated.
+
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+; Check that the alignment gap for the 8-byte x19 is padded
+; with another 8 bytes.
+; The CSR region will look like this:
+; +-------------------+
+; |/////padding///////| (8 bytes)
+; | X19               | (8 bytes)
+; +-------------------+ <- SP -16
+; | Q10, Q11          | (32 bytes)
+; +-------------------+ <- SP -48
+define aarch64_vector_pcs void @test_q10_q11_x19() nounwind {
+; CHECK-LABEL: test_q10_q11_x19
+; CHECK: stp q11, q10, [sp, #-48]!
+; CHECK-NEXT: str x19, [sp, #32]
+  call void asm sideeffect "nop", "~{x19}"()
+  call void asm sideeffect "nop", "~{q10},~{q11}"()
+  ret void
+}
+
+; +-------------------+
+; | X19, X20          | (16 bytes)
+; +-------------------+ <- SP -16
+; | Q10, Q11          | (32 bytes)
+; +-------------------+ <- SP -48
+define aarch64_vector_pcs void @test_q10_q11_x19_x20() nounwind {
+; CHECK-LABEL: test_q10_q11_x19_x20
+; CHECK: stp q11, q10, [sp, #-48]!
+; CHECK-NEXT: stp x20, x19, [sp, #32]
+  call void asm sideeffect "nop", "~{x19},~{x20}"()
+  call void asm sideeffect "nop", "~{q10},~{q11}"()
+  ret void
+}
+
+; Check that the alignment gap is padded with another 8 bytes.
+; The CSR region will look like this:
+; +-------------------+
+; | X19, X20          | (16 bytes)
+; +-------------------+ <- SP -16
+; |/////padding///////| (8 bytes)
+; | X21               | (8 bytes)
+; +-------------------+ <- SP -32
+; | Q10, Q11          | (32 bytes)
+; +-------------------+ <- SP -64
+define aarch64_vector_pcs void @test_q10_q11_x19_x20_x21() nounwind {
+; CHECK-LABEL: test_q10_q11_x19_x20_x21
+; CHECK: stp q11, q10, [sp, #-64]!
+; CHECK-NEXT: str x21, [sp, #32]
+; CHECK-NEXT: stp x20, x19, [sp, #48]
+  call void asm sideeffect "nop", "~{x19},~{x20},~{x21}"()
+  call void asm sideeffect "nop", "~{q10},~{q11}"()
+  ret void
+}
+
+; Test with more callee-saves, which triggers 'BigStack' in
+; AArch64FrameLowering, which in turn causes an emergency spill
+; slot to be allocated. The emergency spill slot is allocated
+; as close as possible to SP, so at SP + 0.
+; +-------------------+
+; | X19..X30          | (96 bytes)
+; +-------------------+ <- SP -96
+; | Q8..Q23           | (256 bytes)
+; +-------------------+ <- SP -352
+; | emergency slot    | (16 bytes)
+; +-------------------+ <- SP -368
+define aarch64_vector_pcs void @test_q8_to_q23_x19_to_x30() nounwind {
+; CHECK-LABEL: test_q8_to_q23_x19_to_x30
+; CHECK: sub sp, sp, #368
+; CHECK-NEXT: stp q23, q22, [sp, #16] // 32-byte Folded Spill
+; CHECK-NEXT: stp q21, q20, [sp, #48] // 32-byte Folded Spill
+; CHECK-NEXT: stp q19, q18, [sp, #80] // 32-byte Folded Spill
+; CHECK-NEXT: stp q17, q16, [sp, #112] // 32-byte Folded Spill
+; CHECK-NEXT: stp q15, q14, [sp, #144] // 32-byte Folded Spill
+; CHECK-NEXT: stp q13, q12, [sp, #176] // 32-byte Folded Spill
+; CHECK-NEXT: stp q11, q10, [sp, #208] // 32-byte Folded Spill
+; CHECK-NEXT: stp q9, q8, [sp, #240] // 32-byte Folded Spill
+; CHECK-NEXT: stp x28, x27, [sp, #272] // 16-byte Folded Spill
+; CHECK-NEXT: stp x26, x25, [sp, #288] // 16-byte Folded Spill
+; CHECK-NEXT: stp x24, x23, [sp, #304] // 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #320] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #336] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #352] // 16-byte Folded Spill
+  call void asm sideeffect "nop", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{lr},~{fp}"()
+  call void asm sideeffect "nop", "~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15},~{q16},~{q17},~{q18},~{q19},~{q20},~{q21},~{q22},~{q23}"()
+  ret void
+}
+
+; When the total stack size is >= 512, the callee-save stores use the
+; pre-increment form rather than a single 'sub sp, sp, <imm>' covering
+; the whole frame.
+; +-------------------+
+; | X19..X30          | (96 bytes)
+; +-------------------+ <- SP -96
+; | Q8..Q23           | (256 bytes)
+; +-------------------+ <- SP -352
+; | 'obj'             | (160 bytes)
+; +-------------------+ <- SP -512
+; | emergency slot    | (16 bytes)
+; +-------------------+ <- SP -528
define aarch64_vector_pcs void @test_q8_to_q23_x19_to_x30_preinc() nounwind {
+; CHECK-LABEL: test_q8_to_q23_x19_to_x30_preinc
+; CHECK: stp q23, q22, [sp, #-352]! // 32-byte Folded Spill
+; CHECK-NEXT: stp q21, q20, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT: stp q19, q18, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT: stp q17, q16, [sp, #96] // 32-byte Folded Spill
+; CHECK-NEXT: stp q15, q14, [sp, #128] // 32-byte Folded Spill
+; CHECK-NEXT: stp q13, q12, [sp, #160] // 32-byte Folded Spill
+; CHECK-NEXT: stp q11, q10, [sp, #192] // 32-byte Folded Spill
+; CHECK-NEXT: stp q9, q8, [sp, #224] // 32-byte Folded Spill
+; CHECK-NEXT: stp x28, x27, [sp, #256] // 16-byte Folded Spill
+; CHECK-NEXT: stp x26, x25, [sp, #272] // 16-byte Folded Spill
+; CHECK-NEXT: stp x24, x23, [sp, #288] // 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #304] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #320] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #336] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #176 // =176
+; CHECK-NEXT: mov w[[IMM:[0-9]+]], #42
+; CHECK-NEXT: strb w[[IMM]], [sp, #16]
+; CHECK-NEXT: strb w[[IMM]], [sp, #47]
+  %obj = alloca [ 160 x i8 ]
+  %first = getelementptr [ 160 x i8 ], [ 160 x i8 ]* %obj, i32 0, i32 0
+  %last = getelementptr [ 160 x i8 ], [ 160 x i8 ]* %obj, i32 0, i32 31
+  store i8 42, i8* %first
+  store i8 42, i8* %last
+  call void asm sideeffect "nop", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{lr},~{fp}"()
+  call void asm sideeffect "nop", "~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15},~{q16},~{q17},~{q18},~{q19},~{q20},~{q21},~{q22},~{q23}"()
+  ret void
+}
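
The tests above exercise the callee side (how an aarch64_vector_pcs function saves q8-q23). The getCallPreservedMask() change also affects the caller side: with CSR_AArch64_AAVPCS_RegMask applied at a call site, the full q8-q23 registers are treated as preserved across the call. The snippet below is an illustrative sketch only, not part of the patch or its RUN line; the names @vpcs_callee and @caller are placeholders.

declare aarch64_vector_pcs void @vpcs_callee()

define <4 x float> @caller(<4 x float> %v) nounwind {
  ; %v is live across the call. With the AAVPCS regmask the register
  ; allocator may keep it in one of q8-q23 instead of spilling it around
  ; the call; under the plain AAPCS mask only the low 64 bits (d8-d15)
  ; of the vector registers are preserved.
  call aarch64_vector_pcs void @vpcs_callee()
  %r = fadd <4 x float> %v, %v
  ret <4 x float> %r
}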