diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -309,6 +309,13 @@ RegScavenger *RS = nullptr) const { } + /// processFunctionBeforeFrameIndicesReplaced - This method is called + /// immediately before MO_FrameIndex operands are eliminated, but after the + /// frame is finalized. This method is optional. + virtual void + processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, + RegScavenger *RS = nullptr) const {} + virtual unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const { report_fatal_error("WinEH not implemented for this target"); } diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -259,6 +259,10 @@ for (auto &I : EntryDbgValues) I.first->insert(I.first->begin(), I.second.begin(), I.second.end()); + // Allow the target to make final modifications to the function after the + // frame layout is finalized, but before frame indices are replaced. + TFI->processFunctionBeforeFrameIndicesReplaced(MF, RS); + // Replace all MO_FrameIndex operands with physical register references // and actual offsets.
// diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -349,22 +349,38 @@ MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - Register SizeReg = MI.getOperand(2).getReg(); - Register AddressReg = MI.getOperand(3).getReg(); + Register SizeReg = MI.getOperand(0).getReg(); + Register AddressReg = MI.getOperand(1).getReg(); MachineFunction *MF = MBB.getParent(); - bool ZeroData = MI.getOpcode() == AArch64::STZGloop; - const unsigned OpCode = + bool ZeroData = MI.getOpcode() == AArch64::STZGloop_wback; + const unsigned OpCode1 = + ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex; + const unsigned OpCode2 = ZeroData ? AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex; + unsigned Size = MI.getOperand(2).getImm(); + assert(Size > 0 && Size % 16 == 0); + if (Size % (16 * 2) != 0) { + BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg) + .addReg(AddressReg) + .addReg(AddressReg) + .addImm(1); + Size -= 16; + } + MachineBasicBlock::iterator I = + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg) + .addImm(Size); + expandMOVImm(MBB, I, 64); + auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); MF->insert(++MBB.getIterator(), LoopBB); MF->insert(++LoopBB->getIterator(), DoneBB); - BuildMI(LoopBB, DL, TII->get(OpCode)) + BuildMI(LoopBB, DL, TII->get(OpCode2)) .addDef(AddressReg) .addReg(AddressReg) .addReg(AddressReg) @@ -706,9 +722,14 @@ MI.eraseFromParent(); return true; } + case AArch64::STGloop_wback: + case AArch64::STZGloop_wback: + return expandSetTagLoop(MBB, MBBI, NextMBBI); case AArch64::STGloop: case AArch64::STZGloop: - return expandSetTagLoop(MBB, MBBI, NextMBBI); + report_fatal_error( + "Non-writeback variants of STGloop / STZGloop 
should not " + "survive past PrologEpilogInserter."); } return false; } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -77,6 +77,10 @@ void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; + void + processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, + RegScavenger *RS) const override; + unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; @@ -107,6 +111,8 @@ int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, int &MinCSFrameIndex, int &MaxCSFrameIndex) const; + bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, + unsigned StackBumpBytes) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -170,6 +170,11 @@ cl::desc("reverse the CSR restore sequence"), cl::init(false), cl::Hidden); +static cl::opt StackTaggingMergeSetTag( + "stack-tagging-merge-settag", + cl::desc("merge settag instruction in function epilog"), cl::init(true), + cl::Hidden); + STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); /// This is the biggest offset to the stack pointer we can encode in aarch64 @@ -480,6 +485,39 @@ return true; } +bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue( + MachineBasicBlock &MBB, unsigned StackBumpBytes) const { + if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes)) + return false; + + if (MBB.empty()) + return true; + + // Disable combined SP bump if the last instruction is an MTE tag store. It + // is almost always better to merge SP adjustment into those instructions. 
+ MachineBasicBlock::iterator LastI = MBB.getFirstTerminator(); + MachineBasicBlock::iterator Begin = MBB.begin(); + while (LastI != Begin) { + --LastI; + if (LastI->isTransient()) + continue; + if (!LastI->getFlag(MachineInstr::FrameDestroy)) + break; + } + switch (LastI->getOpcode()) { + case AArch64::STGloop: + case AArch64::STZGloop: + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + return false; + default: + return true; + } + llvm_unreachable("unreachable"); +} + // Given a load or a store instruction, generate an appropriate unwinding SEH // code on Windows. static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, @@ -1459,7 +1497,7 @@ // function. if (MF.hasEHFunclets()) AFI->setLocalStackSize(NumBytes - PrologueSaveSize); - bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes); // Assume we can't combine the last pop with the sp restore. if (!CombineSPBump && PrologueSaveSize != 0) { @@ -2637,9 +2675,399 @@ .addImm(0); } -/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before -/// the update. This is easily retrieved as it is exactly the offset that is set -/// in processFunctionBeforeFrameFinalized. +namespace { +struct TagStoreInstr { + MachineInstr *MI; + int64_t Offset, Size; + explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size) + : MI(MI), Offset(Offset), Size(Size) {} +}; + +class TagStoreEdit { + MachineFunction *MF; + MachineBasicBlock *MBB; + MachineRegisterInfo *MRI; + // Tag store instructions that are being replaced. + SmallVector TagStores; + // Combined memref arguments of the above instructions. + SmallVector CombinedMemRefs; + + // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg + + // FrameRegOffset + Size) with the address tag of SP. 
+ Register FrameReg; + StackOffset FrameRegOffset; + int64_t Size; + // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end. + Optional FrameRegUpdate; + // MIFlags for any FrameReg updating instructions. + unsigned FrameRegUpdateFlags; + + // Use zeroing instruction variants. + bool ZeroData; + DebugLoc DL; + + void emitUnrolled(MachineBasicBlock::iterator InsertI); + void emitLoop(MachineBasicBlock::iterator InsertI); + +public: + TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData) + : MBB(MBB), ZeroData(ZeroData) { + MF = MBB->getParent(); + MRI = &MF->getRegInfo(); + } + // Add an instruction to be replaced. Instructions must be added in the + // ascending order of Offset, and have to be adjacent. + void addInstruction(TagStoreInstr I) { + assert((TagStores.empty() || + TagStores.back().Offset + TagStores.back().Size == I.Offset) && + "Non-adjacent tag store instructions."); + TagStores.push_back(I); + } + void clear() { TagStores.clear(); } + // Emit equivalent code at the given location, and erase the current set of + // instructions. May skip if the replacement is not profitable. May invalidate + // the input iterator and replace it with a valid one. 
+ void emitCode(MachineBasicBlock::iterator &InsertI, + const AArch64FrameLowering *TFI, bool IsLast); +}; + +void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) { + const AArch64InstrInfo *TII = + MF->getSubtarget().getInstrInfo(); + + const int64_t kMinOffset = -256 * 16; + const int64_t kMaxOffset = 255 * 16; + + Register BaseReg = FrameReg; + int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes(); + if (BaseRegOffsetBytes < kMinOffset || + BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) { + Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg, + {BaseRegOffsetBytes, MVT::i8}, TII); + BaseReg = ScratchReg; + BaseRegOffsetBytes = 0; + } + + MachineInstr *LastI = nullptr; + while (Size) { + int64_t InstrSize = (Size > 16) ? 32 : 16; + unsigned Opcode = + InstrSize == 16 + ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset) + : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset); + MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode)) + .addReg(AArch64::SP) + .addReg(BaseReg) + .addImm(BaseRegOffsetBytes / 16) + .setMemRefs(CombinedMemRefs); + // A store to [BaseReg, #0] should go last for an opportunity to fold the + // final SP adjustment in the epilogue. + if (BaseRegOffsetBytes == 0) + LastI = I; + BaseRegOffsetBytes += InstrSize; + Size -= InstrSize; + } + + if (LastI) + MBB->splice(InsertI, MBB, LastI); +} + +void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) { + const AArch64InstrInfo *TII = + MF->getSubtarget().getInstrInfo(); + + Register BaseReg = FrameRegUpdate + ? 
FrameReg + : MRI->createVirtualRegister(&AArch64::GPR64RegClass); + Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); + + emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII); + + int64_t LoopSize = Size; + // If the loop size is not a multiple of 32, split off one 16-byte store at + // the end to fold BaseReg update into. + if (FrameRegUpdate && *FrameRegUpdate) + LoopSize -= LoopSize % 32; + MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL, + TII->get(ZeroData ? AArch64::STZGloop_wback + : AArch64::STGloop_wback)) + .addDef(SizeReg) + .addDef(BaseReg) + .addImm(LoopSize) + .addReg(BaseReg) + .setMemRefs(CombinedMemRefs); + if (FrameRegUpdate) + LoopI->setFlags(FrameRegUpdateFlags); + + int64_t ExtraBaseRegUpdate = + FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0; + if (LoopSize < Size) { + assert(FrameRegUpdate); + assert(Size - LoopSize == 16); + // Tag 16 more bytes at BaseReg and update BaseReg. + BuildMI(*MBB, InsertI, DL, + TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex)) + .addDef(BaseReg) + .addReg(BaseReg) + .addReg(BaseReg) + .addImm(1 + ExtraBaseRegUpdate / 16) + .setMemRefs(CombinedMemRefs) + .setMIFlags(FrameRegUpdateFlags); + } else if (ExtraBaseRegUpdate) { + // Update BaseReg. + BuildMI( + *MBB, InsertI, DL, + TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri)) + .addDef(BaseReg) + .addReg(BaseReg) + .addImm(std::abs(ExtraBaseRegUpdate)) + .addImm(0) + .setMIFlags(FrameRegUpdateFlags); + } +} + +// Check if *II is a register update that can be merged into STGloop that ends +// at (Reg + Size). On success, *TotalOffset is set to the signed adjustment +// that *II applies to Reg.
+bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg, + int64_t Size, int64_t *TotalOffset) { + MachineInstr &MI = *II; + if ((MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::SUBXri) && + MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) { + unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm()); + int64_t Offset = MI.getOperand(2).getImm() << Shift; + if (MI.getOpcode() == AArch64::SUBXri) + Offset = -Offset; + int64_t AbsPostOffset = std::abs(Offset - Size); + const int64_t kMaxOffset = + 4080 - 16; // Max post-offset that fits STG post-index (imm <= 255) + if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) { + *TotalOffset = Offset; + return true; + } + } + return false; +} + +void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE, + SmallVectorImpl<MachineMemOperand *> &MemRefs) { + MemRefs.clear(); + for (auto &TS : TSE) { + MachineInstr *MI = TS.MI; + // An instruction without memory operands may access anything. Be + // conservative and return an empty list.
+ if (MI->memoperands_empty()) { + MemRefs.clear(); + return; + } + MemRefs.append(MI->memoperands_begin(), MI->memoperands_end()); + } +} + +void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, + const AArch64FrameLowering *TFI, bool IsLast) { + if (TagStores.empty()) + return; + TagStoreInstr &FirstTagStore = TagStores[0]; + TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1]; + Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size; + DL = TagStores[0].MI->getDebugLoc(); + + unsigned Reg; + FrameRegOffset = TFI->resolveFrameOffsetReference( + *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg, + /*PreferFP=*/false, /*ForSimm=*/true); + FrameReg = Reg; + FrameRegUpdate = None; + + mergeMemRefs(TagStores, CombinedMemRefs); + + LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n"; + for (const auto &Instr + : TagStores) { dbgs() << " " << *Instr.MI; }); + + // Size threshold where a loop becomes shorter than a linear sequence of + // tagging instructions. + const int kSetTagLoopThreshold = 176; + if (Size < kSetTagLoopThreshold) { + if (TagStores.size() < 2) + return; + emitUnrolled(InsertI); + } else { + MachineInstr *UpdateInstr = nullptr; + int64_t TotalOffset; + if (IsLast) { + // See if we can merge base register update into the STGloop. + // This is done in AArch64LoadStoreOptimizer for "normal" stores, + // but STGloop is way too unusual for that, and also it only + // realistically happens in function epilogue. Also, STGloop is expanded + // before that pass. 
+ if (InsertI != MBB->end() && + canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size, + &TotalOffset)) { + UpdateInstr = &*InsertI++; + LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n " + << *UpdateInstr); + } + } + + if (!UpdateInstr && TagStores.size() < 2) + return; + + if (UpdateInstr) { + FrameRegUpdate = TotalOffset; + FrameRegUpdateFlags = UpdateInstr->getFlags(); + } + emitLoop(InsertI); + if (UpdateInstr) + UpdateInstr->eraseFromParent(); + } + + for (auto &TS : TagStores) + TS.MI->eraseFromParent(); +} + +bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset, + int64_t &Size, bool &ZeroData) { + MachineFunction &MF = *MI.getParent()->getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + unsigned Opcode = MI.getOpcode(); + ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset || + Opcode == AArch64::STZ2GOffset); + + if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) { + if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead()) + return false; + if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI()) + return false; + Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex()); + Size = MI.getOperand(2).getImm(); + return true; + } + + if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset) + Size = 16; + else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset) + Size = 32; + else + return false; + + if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI()) + return false; + + Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) + + 16 * MI.getOperand(2).getImm(); + return true; +} + +// Detect a run of memory tagging instructions for adjacent stack frame slots, +// and replace them with a shorter instruction sequence: +// * replace STG + STG with ST2G +// * replace STGloop + STGloop with STGloop +// This code needs to run when stack slot offsets are already known, but before +// FrameIndex operands in STG 
instructions are eliminated. +MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, + const AArch64FrameLowering *TFI, + RegScavenger *RS) { + bool FirstZeroData; + int64_t Size, Offset; + MachineInstr &MI = *II; + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::iterator NextI = ++II; + if (&MI == &MBB->instr_back()) + return II; + if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData)) + return II; + + SmallVector Instrs; + Instrs.emplace_back(&MI, Offset, Size); + + constexpr int kScanLimit = 10; + int Count = 0; + for (MachineBasicBlock::iterator E = MBB->end(); + NextI != E && Count < kScanLimit; ++NextI) { + MachineInstr &MI = *NextI; + bool ZeroData; + int64_t Size, Offset; + // Collect instructions that update memory tags with a FrameIndex operand + // and (when applicable) constant size, and whose output registers are dead + // (the latter is almost always the case in practice). Since these + // instructions effectively have no inputs or outputs, we are free to skip + // any non-aliasing instructions in between without tracking used registers. + if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) { + if (ZeroData != FirstZeroData) + break; + Instrs.emplace_back(&MI, Offset, Size); + continue; + } + + // Only count non-transient, non-tagging instructions toward the scan + // limit. + if (!MI.isTransient()) + ++Count; + + // Just in case, stop before the epilogue code starts. + if (MI.getFlag(MachineInstr::FrameSetup) || + MI.getFlag(MachineInstr::FrameDestroy)) + break; + + // Reject anything that may alias the collected instructions. + if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects()) + break; + } + + // New code will be inserted after the last tagging instruction we've found. 
+ MachineBasicBlock::iterator InsertI = Instrs.back().MI; + InsertI++; + + llvm::stable_sort(Instrs, + [](const TagStoreInstr &Left, const TagStoreInstr &Right) { + return Left.Offset < Right.Offset; + }); + + // Make sure that we don't have any overlapping stores. + int64_t CurOffset = Instrs[0].Offset; + for (auto &Instr : Instrs) { + if (CurOffset > Instr.Offset) + return NextI; + CurOffset = Instr.Offset + Instr.Size; + } + + // Find contiguous runs of tagged memory and emit shorter instruction + // sequences for them when possible. + TagStoreEdit TSE(MBB, FirstZeroData); + Optional EndOffset; + for (auto &Instr : Instrs) { + if (EndOffset && *EndOffset != Instr.Offset) { + // Found a gap. + TSE.emitCode(InsertI, TFI, /*IsLast = */ false); + TSE.clear(); + } + + TSE.addInstruction(Instr); + EndOffset = Instr.Offset + Instr.Size; + } + + TSE.emitCode(InsertI, TFI, /*IsLast = */ true); + + return InsertI; +} +} // namespace + +void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( + MachineFunction &MF, RegScavenger *RS = nullptr) const { + if (StackTaggingMergeSetTag) + for (auto &BB : MF) + for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) + II = tryMergeAdjacentSTG(II, this, RS); +} + +/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP +/// before the update. This is easily retrieved as it is exactly the offset +/// that is set in processFunctionBeforeFrameFinalized.
int AArch64FrameLowering::getFrameIndexReferencePreferSP( const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3458,6 +3458,8 @@ case AArch64::ST1Fourv1d: case AArch64::IRG: case AArch64::IRGstack: + case AArch64::STGloop: + case AArch64::STZGloop: return AArch64FrameOffsetCannotUpdate; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1514,17 +1514,29 @@ // register / expression for the tagged base pointer of the current function. def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>; -// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address. -// $Rn_wback is one past the end of the range. +// Large STG to be expanded into a loop. $sz is the size, $Rn is start address. +// $Rn_wback is one past the end of the range. $Rm is the loop counter. let isCodeGenOnly=1, mayStore=1 in { +def STGloop_wback + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + Sched<[WriteAdr, WriteST]>; + +def STZGloop_wback + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + Sched<[WriteAdr, WriteST]>; + +// Variants of the above where $Rn2 is an independent register not tied to the input register $Rn. +// Their purpose is to use a FrameIndex operand as $Rn (which of course cannot be written back).
def STGloop - : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn), + [], "@earlyclobber $Rn2,@earlyclobber $Rm" >, Sched<[WriteAdr, WriteST]>; def STZGloop - : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn), + [], "@earlyclobber $Rn2,@earlyclobber $Rm" >, Sched<[WriteAdr, WriteST]>; } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -390,6 +390,10 @@ if (isFrameOffsetLegal(MI, AArch64::SP, Offset)) return false; + // If even offset 0 is illegal, we don't want a virtual base register. + if (!isFrameOffsetLegal(MI, AArch64::SP, 0)) + return false; + // The offset likely isn't legal; we want to allocate a virtual base register. return true; } @@ -445,6 +449,27 @@ (void)Done; } +// Create a scratch register for the frame index elimination in an instruction. +// This function has special handling of stack tagging loop pseudos, in which +// case it can also change the instruction opcode (but not the operands). +static Register +createScratchRegisterForInstruction(MachineInstr &MI, + const AArch64InstrInfo *TII) { + // ST*Gloop have a reserved scratch register in operand 1. Use it, and also + // replace the instruction with the writeback variant because it will now + // satisfy the operand constraints for it. 
+ if (MI.getOpcode() == AArch64::STGloop) { + MI.setDesc(TII->get(AArch64::STGloop_wback)); + return MI.getOperand(1).getReg(); + } else if (MI.getOpcode() == AArch64::STZGloop) { + MI.setDesc(TII->get(AArch64::STZGloop_wback)); + return MI.getOperand(1).getReg(); + } else { + return MI.getMF()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); + } +} + void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -531,8 +556,7 @@ // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. - Register ScratchReg = - MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + Register ScratchReg = createScratchRegisterForInstruction(MI, TII); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); } diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -125,21 +125,18 @@ return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand, ZeroData); - if (ObjSize % 32 != 0) { - SDNode *St1 = DAG.getMachineNode( - ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl, - {MVT::i64, MVT::Other}, - {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain}); - DAG.setNodeMemRefs(cast(St1), {BaseMemOperand}); - ObjSize -= 16; - Addr = SDValue(St1, 0); - Chain = SDValue(St1, 1); - } - const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other}; - SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain}; - SDNode *St = DAG.getMachineNode( - ZeroData ? 
AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops); + + unsigned Opcode; + if (Addr.getOpcode() == ISD::FrameIndex) { + int FI = cast(Addr)->getIndex(); + Addr = DAG.getTargetFrameIndex(FI, MVT::i64); + Opcode = ZeroData ? AArch64::STZGloop : AArch64::STGloop; + } else { + Opcode = ZeroData ? AArch64::STZGloop_wback : AArch64::STGloop_wback; + } + SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain}; + SDNode *St = DAG.getMachineNode(Opcode, dl, ResTys, Ops); DAG.setNodeMemRefs(cast(St), {BaseMemOperand}); return SDValue(St, 2); diff --git a/llvm/test/CodeGen/AArch64/settag-merge.ll b/llvm/test/CodeGen/AArch64/settag-merge.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/settag-merge.ll @@ -0,0 +1,214 @@ +; RUN: llc < %s -mtriple=aarch64 -mattr=+mte | FileCheck %s + +declare void @use(i8* %p) +declare void @llvm.aarch64.settag(i8* %p, i64 %a) +declare void @llvm.aarch64.settag.zero(i8* %p, i64 %a) + +define void @stg16_16() { +entry: +; CHECK-LABEL: stg16_16: +; CHECK: st2g sp, [sp], #32 +; CHECK: ret + %a = alloca i8, i32 16, align 16 + %b = alloca i8, i32 16, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 16) + call void @llvm.aarch64.settag(i8* %b, i64 16) + ret void +} + +define i32 @stg16_16_16_16_ret() { +entry: +; CHECK-LABEL: stg16_16_16_16_ret: +; CHECK: st2g sp, [sp, #32] +; CHECK: st2g sp, [sp], #64 +; CHECK: mov w0, wzr +; CHECK: ret + %a = alloca i8, i32 16, align 16 + %b = alloca i8, i32 16, align 16 + %c = alloca i8, i32 16, align 16 + %d = alloca i8, i32 16, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 16) + call void @llvm.aarch64.settag(i8* %b, i64 16) + call void @llvm.aarch64.settag(i8* %c, i64 16) + call void @llvm.aarch64.settag(i8* %d, i64 16) + ret i32 0 +} + +define void @stg16_16_16_16() { +entry: +; CHECK-LABEL: stg16_16_16_16: +; CHECK: st2g sp, [sp, #32] +; CHECK: st2g sp, [sp], #64 +; CHECK: ret + %a = alloca i8, i32 16, align 16 + %b = alloca i8, i32 16, align 16 + %c = 
alloca i8, i32 16, align 16 + %d = alloca i8, i32 16, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 16) + call void @llvm.aarch64.settag(i8* %b, i64 16) + call void @llvm.aarch64.settag(i8* %c, i64 16) + call void @llvm.aarch64.settag(i8* %d, i64 16) + ret void +} + +define void @stg128_128_128_128() { +entry: +; CHECK-LABEL: stg128_128_128_128: +; CHECK: mov x8, #512 +; CHECK: st2g sp, [sp], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: ret + %a = alloca i8, i32 128, align 16 + %b = alloca i8, i32 128, align 16 + %c = alloca i8, i32 128, align 16 + %d = alloca i8, i32 128, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 128) + call void @llvm.aarch64.settag(i8* %b, i64 128) + call void @llvm.aarch64.settag(i8* %c, i64 128) + call void @llvm.aarch64.settag(i8* %d, i64 128) + ret void +} + +define void @stg16_512_16() { +entry: +; CHECK-LABEL: stg16_512_16: +; CHECK: mov x8, #544 +; CHECK: st2g sp, [sp], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: ret + %a = alloca i8, i32 16, align 16 + %b = alloca i8, i32 512, align 16 + %c = alloca i8, i32 16, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 16) + call void @llvm.aarch64.settag(i8* %b, i64 512) + call void @llvm.aarch64.settag(i8* %c, i64 16) + ret void +} + +define void @stg512_512_512() { +entry: +; CHECK-LABEL: stg512_512_512: +; CHECK: mov x8, #1536 +; CHECK: st2g sp, [sp], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: ret + %a = alloca i8, i32 512, align 16 + %b = alloca i8, i32 512, align 16 + %c = alloca i8, i32 512, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 512) + call void @llvm.aarch64.settag(i8* %b, i64 512) + call void @llvm.aarch64.settag(i8* %c, i64 512) + ret void +} + +define void @early(i1 %flag) { +entry: +; CHECK-LABEL: early: +; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] +; CHECK: st2g sp, [sp, # +; CHECK: st2g sp, [sp, # +; CHECK: st2g sp, [sp, # +; CHECK: [[LABEL]]: +; CHECK: stg sp, [sp, # +; CHECK: st2g sp, [sp], # +; CHECK: 
ret + %a = alloca i8, i32 48, align 16 + %b = alloca i8, i32 48, align 16 + %c = alloca i8, i32 48, align 16 + br i1 %flag, label %if.then, label %if.end + +if.then: + call void @llvm.aarch64.settag(i8* %a, i64 48) + call void @llvm.aarch64.settag(i8* %b, i64 48) + br label %if.end + +if.end: + call void @llvm.aarch64.settag(i8* %c, i64 48) + ret void +} + +define void @early_128_128(i1 %flag) { +entry: +; CHECK-LABEL: early_128_128: +; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] +; CHECK: add x9, sp, # +; CHECK: mov x8, #256 +; CHECK: st2g x9, [x9], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: [[LABEL]]: +; CHECK: stg sp, [sp, # +; CHECK: st2g sp, [sp], # +; CHECK: ret + %a = alloca i8, i32 128, align 16 + %b = alloca i8, i32 128, align 16 + %c = alloca i8, i32 48, align 16 + br i1 %flag, label %if.then, label %if.end + +if.then: + call void @llvm.aarch64.settag(i8* %a, i64 128) + call void @llvm.aarch64.settag(i8* %b, i64 128) + br label %if.end + +if.end: + call void @llvm.aarch64.settag(i8* %c, i64 48) + ret void +} + +define void @early_512_512(i1 %flag) { +entry: +; CHECK-LABEL: early_512_512: +; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] +; CHECK: add x9, sp, # +; CHECK: mov x8, #1024 +; CHECK: st2g x9, [x9], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: [[LABEL]]: +; CHECK: stg sp, [sp, # +; CHECK: st2g sp, [sp], # +; CHECK: ret + %a = alloca i8, i32 512, align 16 + %b = alloca i8, i32 512, align 16 + %c = alloca i8, i32 48, align 16 + br i1 %flag, label %if.then, label %if.end + +if.then: + call void @llvm.aarch64.settag(i8* %a, i64 512) + call void @llvm.aarch64.settag(i8* %b, i64 512) + br label %if.end + +if.end: + call void @llvm.aarch64.settag(i8* %c, i64 48) + ret void +} + +; Two loops of size 256; the second loop updates SP. 
+define void @stg128_128_gap_128_128() { +entry: +; CHECK-LABEL: stg128_128_gap_128_128: +; CHECK: mov x9, sp +; CHECK: mov x8, #256 +; CHECK: st2g x9, [x9], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: mov x8, #256 +; CHECK: st2g sp, [sp], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: ret + %a = alloca i8, i32 128, align 16 + %a2 = alloca i8, i32 128, align 16 + %b = alloca i8, i32 32, align 16 + %c = alloca i8, i32 128, align 16 + %c2 = alloca i8, i32 128, align 16 + call void @use(i8* %b) + call void @llvm.aarch64.settag(i8* %a, i64 128) + call void @llvm.aarch64.settag(i8* %a2, i64 128) + call void @llvm.aarch64.settag(i8* %c, i64 128) + call void @llvm.aarch64.settag(i8* %c2, i64 128) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/settag-merge.mir b/llvm/test/CodeGen/AArch64/settag-merge.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/settag-merge.mir @@ -0,0 +1,83 @@ +# RUN: llc -mtriple=aarch64 -mattr=+mte -run-pass=prologepilog %s -o - | FileCheck %s + +--- | + declare void @llvm.aarch64.settag(i8* nocapture writeonly, i64) argmemonly nounwind writeonly "target-features"="+mte" + define i32 @stg16_16_16_16_ret() "target-features"="+mte" { + entry: + %a = alloca i8, i32 16, align 16 + %b = alloca i8, i32 16, align 16 + %c = alloca i8, i32 16, align 16 + %d = alloca i8, i32 16, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 16) + call void @llvm.aarch64.settag(i8* %b, i64 16) + call void @llvm.aarch64.settag(i8* %c, i64 16) + call void @llvm.aarch64.settag(i8* %d, i64 16) + ret i32 0 + } + + define void @stg16_store_128() "target-features"="+mte" { + entry: + %a = alloca i8, i32 16, align 16 + %b = alloca i8, i32 128, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 16) + store i8 42, i8* %a + call void @llvm.aarch64.settag(i8* %b, i64 128) + ret void + } + +... +--- +# A sequence of STG with a register copy in the middle. +# Can be merged into ST2G + ST2G. 
+# CHECK-LABEL: name:{{.*}}stg16_16_16_16_ret +# CHECK-DAG: ST2GOffset $sp, $sp, 2 +# CHECK-DAG: ST2GOffset $sp, $sp, 0 +# CHECK-DAG: $w0 = COPY $wzr +# CHECK-DAG: RET_ReallyLR implicit killed $w0 + +name: stg16_16_16_16_ret +tracksRegLiveness: true +stack: + - { id: 0, name: a, size: 16, alignment: 16 } + - { id: 1, name: b, size: 16, alignment: 16 } + - { id: 2, name: c, size: 16, alignment: 16 } + - { id: 3, name: d, size: 16, alignment: 16 } +body: | + bb.0.entry: + STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a) + STGOffset $sp, %stack.1.b, 0 :: (store 16 into %ir.b) + STGOffset $sp, %stack.2.c, 0 :: (store 16 into %ir.c) + $w0 = COPY $wzr + STGOffset $sp, %stack.3.d, 0 :: (store 16 into %ir.d) + RET_ReallyLR implicit killed $w0 + +... + +--- +# A store in the middle prevents merging. +# CHECK-LABEL: name:{{.*}}stg16_store_128 +# CHECK: ST2GOffset $sp, $sp, 2 +# CHECK: ST2GOffset $sp, $sp, 4 +# CHECK: ST2GOffset $sp, $sp, 6 +# CHECK: STGOffset $sp, $sp, 8 +# CHECK: STRBBui +# CHECK: ST2GOffset $sp, $sp, 0 +# CHECK: RET_ReallyLR + +name: stg16_store_128 +tracksRegLiveness: true +stack: + - { id: 0, name: a, size: 16, alignment: 16 } + - { id: 1, name: b, size: 128, alignment: 16 } +body: | + bb.0.entry: + STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a) + renamable $w8 = MOVi32imm 42 + ST2GOffset $sp, %stack.1.b, 6 :: (store 32 into %ir.b + 96, align 16) + ST2GOffset $sp, %stack.1.b, 4 :: (store 32 into %ir.b + 64, align 16) + ST2GOffset $sp, %stack.1.b, 2 :: (store 32 into %ir.b + 32, align 16) + STRBBui killed renamable $w8, %stack.0.a, 0 :: (store 1 into %ir.a, align 16) + ST2GOffset $sp, %stack.1.b, 0 :: (store 32 into %ir.b, align 16) + RET_ReallyLR + +... 
diff --git a/llvm/test/CodeGen/AArch64/settag.ll b/llvm/test/CodeGen/AArch64/settag.ll --- a/llvm/test/CodeGen/AArch64/settag.ll +++ b/llvm/test/CodeGen/AArch64/settag.ll @@ -64,8 +64,8 @@ define void @stg17(i8* %p) { entry: ; CHECK-LABEL: stg17: -; CHECK: mov {{(w|x)}}[[R:[0-9]+]], #256 ; CHECK: stg x0, [x0], #16 +; CHECK: mov {{(w|x)}}[[R:[0-9]+]], #256 ; CHECK: st2g x0, [x0], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], @@ -87,8 +87,8 @@ define void @stzg17(i8* %p) { entry: ; CHECK-LABEL: stzg17: -; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 ; CHECK: stzg x0, [x0], #16 +; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 ; CHECK: stz2g x0, [x0], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], @@ -110,10 +110,10 @@ define void @stg_alloca5() { entry: ; CHECK-LABEL: stg_alloca5: -; CHECK: stg sp, [sp, #64] -; CHECK: st2g sp, [sp, #32] -; CHECK: st2g sp, [sp] -; CHECK: ret +; CHECK: st2g sp, [sp, #32] +; CHECK-NEXT: stg sp, [sp, #64] +; CHECK-NEXT: st2g sp, [sp], #80 +; CHECK-NEXT: ret %a = alloca i8, i32 80, align 16 call void @llvm.aarch64.settag(i8* %a, i64 80) ret void @@ -122,12 +122,11 @@ define void @stg_alloca17() { entry: ; CHECK-LABEL: stg_alloca17: -; CHECK: mov [[P:x[0-9]+]], sp -; CHECK: stg [[P]], {{\[}}[[P]]{{\]}}, #16 ; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 -; CHECK: st2g [[P]], {{\[}}[[P]]{{\]}}, #32 +; CHECK: st2g sp, [sp], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], +; CHECK: stg sp, [sp], #16 ; CHECK: ret %a = alloca i8, i32 272, align 16 call void @llvm.aarch64.settag(i8* %a, i64 272) diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll --- a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll @@ -210,11 +210,10 @@ ; DEFAULT: ldrb [[A:w.*]], [x{{.*}}] ; DEFAULT: ldrb [[B:w.*]], [x{{.*}}] -; ALWAYS: ldg [[PA:x.*]], [x{{.*}}] -; ALWAYS: ldrb [[B:w.*]], [sp] -; ALWAYS: ldrb 
[[A:w.*]], {{\[}}[[PA]]{{\]}} +; ALWAYS-DAG: ldg [[PA:x.*]], [x{{.*}}] +; ALWAYS-DAG: ldrb [[B:w.*]], [sp] +; ALWAYS-DAG: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}} -; COMMON: add w0, [[B]], [[A]] ; COMMON: ret ; One of these allocas is closer to FP than to SP, and within 256 bytes