diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -309,6 +309,13 @@
                                              RegScavenger *RS = nullptr) const {
   }
 
+  /// processFunctionBeforeFrameIndicesReplaced - This method is called
+  /// immediately before MO_FrameIndex operands are eliminated, but after the
+  /// frame is finalized. This method is optional.
+  virtual void
+  processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
+                                            RegScavenger *RS = nullptr) const {}
+
   virtual unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const {
     report_fatal_error("WinEH not implemented for this target");
   }
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -258,6 +258,10 @@
   for (auto &I : EntryDbgValues)
     I.first->insert(I.first->begin(), I.second.begin(), I.second.end());
 
+  // Allow the target to make final modifications to the function after the
+  // frame layout is finalized, but before frame indices are replaced.
+  TFI->processFunctionBeforeFrameIndicesReplaced(MF, RS);
+
   // Replace all MO_FrameIndex operands with physical register references
   // and actual offsets.
   //
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -349,22 +349,38 @@
     MachineBasicBlock::iterator &NextMBBI) {
   MachineInstr &MI = *MBBI;
   DebugLoc DL = MI.getDebugLoc();
-  Register SizeReg = MI.getOperand(2).getReg();
-  Register AddressReg = MI.getOperand(3).getReg();
+  Register SizeReg = MI.getOperand(0).getReg();
+  Register AddressReg = MI.getOperand(1).getReg();
 
   MachineFunction *MF = MBB.getParent();
 
   bool ZeroData = MI.getOpcode() == AArch64::STZGloop;
-  const unsigned OpCode =
+  const unsigned OpCode1 =
+      ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex;
+  const unsigned OpCode2 =
       ZeroData ? AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex;
 
+  unsigned Size = MI.getOperand(2).getImm();
+  assert(Size > 0 && Size % 16 == 0);
+  if (Size % (16 * 2) != 0) {
+    BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg)
+        .addReg(AddressReg)
+        .addReg(AddressReg)
+        .addImm(1);
+    Size -= 16;
+  }
+  MachineBasicBlock::iterator I =
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg)
+          .addImm(Size);
+  expandMOVImm(MBB, I, 64);
+
   auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
   auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
 
   MF->insert(++MBB.getIterator(), LoopBB);
   MF->insert(++LoopBB->getIterator(), DoneBB);
 
-  BuildMI(LoopBB, DL, TII->get(OpCode))
+  BuildMI(LoopBB, DL, TII->get(OpCode2))
       .addDef(AddressReg)
       .addReg(AddressReg)
       .addReg(AddressReg)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -76,6 +76,9 @@
   void processFunctionBeforeFrameFinalized(MachineFunction &MF,
                                            RegScavenger *RS) const override;
 
+  void processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
+                                             RegScavenger *RS) const override;
+
   unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
 
   unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
@@ -106,6 +109,8 @@
   int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
                                       int &MinCSFrameIndex,
                                       int &MaxCSFrameIndex) const;
+  bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
+                                                unsigned StackBumpBytes) const;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -170,6 +170,10 @@
                          cl::desc("reverse the CSR restore sequence"),
                          cl::init(false), cl::Hidden);
 
+static cl::opt<bool> StackTaggingMergeSetTag("stack-tagging-merge-settag",
+    cl::desc("merge settag instruction in function epilog"),
+    cl::init(true), cl::Hidden);
+
 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
 
 /// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -478,6 +482,39 @@
   return true;
 }
 
+bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
+    MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
+  if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
+    return false;
+
+  if (MBB.empty())
+    return true;
+
+  // Disable combined SP bump if the last instruction is an MTE tag store. It
+  // is almost always better to merge SP adjustment into those instructions.
+  MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
+  MachineBasicBlock::iterator Begin = MBB.begin();
+  while (LastI != Begin) {
+    --LastI;
+    if (LastI->isTransient())
+      continue;
+    if (!LastI->getFlag(MachineInstr::FrameDestroy))
+      break;
+  }
+  switch (LastI->getOpcode()) {
+  case AArch64::STGloop:
+  case AArch64::STZGloop:
+  case AArch64::STGOffset:
+  case AArch64::STZGOffset:
+  case AArch64::ST2GOffset:
+  case AArch64::STZ2GOffset:
+    return false;
+  default:
+    return true;
+  }
+  llvm_unreachable("unreachable");
+}
+
 // Given a load or a store instruction, generate an appropriate unwinding SEH
 // code on Windows.
 static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
@@ -1457,7 +1494,7 @@
   // function.
   if (MF.hasEHFunclets())
     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
-  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+  bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
   // Assume we can't combine the last pop with the sp restore.
 
   if (!CombineSPBump && PrologueSaveSize != 0) {
@@ -2606,9 +2643,344 @@
       .addImm(0);
 }
 
-/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before
-/// the update. This is easily retrieved as it is exactly the offset that is set
-/// in processFunctionBeforeFrameFinalized.
+/// Decode a stack tag store into its frame offset, byte size and zeroing flag.
+///
+/// Two instruction shapes are recognised:
+///  * STGloop / STZGloop: both register defs (operands 0 and 1) must be dead,
+///    the size is immediate operand 2 and the slot is frame-index operand 3.
+///  * ST[Z]GOffset (16 bytes) / ST[Z]2GOffset (32 bytes): operand 0 must be
+///    SP, the slot is frame-index operand 1, displaced by 16 * immediate
+///    operand 2.
+///
+/// \param MI        instruction to inspect.
+/// \param Offset    on success, the MachineFrameInfo object offset of the
+///                  slot (plus the instruction's own displacement).
+/// \param Size      on success, the number of bytes tagged.
+/// \param ZeroData  set to true for the STZ* opcodes; written on every call,
+///                  including when the function returns false.
+/// \returns true if MI matched one of the shapes above. On false, Offset and
+///          Size must not be relied upon (Size may already have been set).
+static bool extractSTGOffsetAndSize(MachineInstr &MI, int64_t &Offset,
+                                    int64_t &Size, bool &ZeroData) {
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  unsigned Opcode = MI.getOpcode();
+  ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
+              Opcode == AArch64::STZ2GOffset);
+
+  if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
+    // The loop pseudo clobbers its two defs; they can only be reused as
+    // scratch registers when nothing reads them afterwards.
+    if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
+      return false;
+    if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
+      return false;
+    Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
+    Size = MI.getOperand(2).getImm();
+    return true;
+  }
+
+  if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
+    Size = 16;
+  else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
+    Size = 32;
+  else
+    return false;
+
+  if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
+    return false;
+
+  // The immediate is expressed in units of 16 bytes.
+  Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
+           16 * MI.getOperand(2).getImm();
+  return true;
+}
+
+/// Pick up to two scratch registers for a replacement tag-store sequence.
+///
+/// If any instruction in \p Instrs is an STGloop / STZGloop, its two register
+/// defs (operands 0 and 1) are reused. Otherwise fresh GPR64 virtual registers
+/// are created through \p MRI.
+///
+/// \param Reg1, Reg2  optional outputs; a null pointer means "not needed".
+static void FindOrCreateScratchRegisters(
+    MachineRegisterInfo *MRI,
+    const SmallVectorImpl<MachineInstr *> &Instrs, unsigned *Reg1,
+    unsigned *Reg2) {
+  // Check if anything in Instrs has registers that will become dead once the
+  // instructions are removed.
+  for (MachineInstr *MI : Instrs) {
+    if (MI->getOpcode() == AArch64::STGloop ||
+        MI->getOpcode() == AArch64::STZGloop) {
+      if (Reg1)
+        *Reg1 = MI->getOperand(0).getReg();
+      if (Reg2)
+        *Reg2 = MI->getOperand(1).getReg();
+      return;
+    }
+  }
+  if (Reg1)
+    *Reg1 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+  if (Reg2)
+    *Reg2 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+}
+
+/// Emit an unrolled run of ST[Z]2G / ST[Z]G stores, inserted before \p II,
+/// covering \p Size bytes of the frame starting at \p StartOffset.
+///
+/// 32-byte stores are used while more than 16 bytes remain, followed by one
+/// 16-byte store if needed. If the resolved base offset does not fit the
+/// [-256 * 16, 255 * 16] window checked below, the start address is first
+/// materialised into a scratch register and the stores are emitted relative to
+/// it. Memory operands of all instructions in \p Instrs are merged onto every
+/// emitted store.
+static void EmitUnrolledSetTag(
+    MachineBasicBlock::iterator II, const AArch64FrameLowering *TFI,
+    int64_t StartOffset, int64_t Size,
+    const SmallVectorImpl<MachineInstr *> &Instrs, bool ZeroData) {
+  DebugLoc DL = Instrs[0]->getDebugLoc();
+  MachineBasicBlock *MBB = II->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  /*FIXME: check full range*/
+  assert(StartOffset % 16 == 0);
+  // The emission loop below consumes Size in steps of 16 or 32 and only stops
+  // at exactly zero.
+  assert(Size % 16 == 0 && "tag store size must be a multiple of 16");
+
+  const int64_t kMinOffset = -256 * 16;
+  const int64_t kMaxOffset = 255 * 16;
+  unsigned BaseReg;
+  StackOffset BaseRegOffset = TFI->resolveFrameOffsetReference(
+      MF, StartOffset, false /*isFixed*/, false /*isSVE*/, BaseReg,
+      /*PreferFP=*/false, /*ForSimm=*/true);
+  int64_t BaseRegOffsetBytes = BaseRegOffset.getBytes();
+  if (BaseRegOffsetBytes < kMinOffset ||
+      BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
+    unsigned ScratchReg;
+    FindOrCreateScratchRegisters(&MF.getRegInfo(), Instrs, &ScratchReg,
+                                 nullptr);
+    emitFrameOffset(*MBB, II, DL, ScratchReg, BaseReg, BaseRegOffset, TII);
+    BaseReg = ScratchReg;
+    BaseRegOffsetBytes = 0;
+  }
+
+  MachineInstr *LastI = nullptr;
+  while (Size) {
+    int64_t InstrSize = (Size > 16) ? 32 : 16;
+    unsigned Opcode =
+        InstrSize == 16
+            ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
+            : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
+    MachineInstr *I =
+        BuildMI(*MBB, II, DL, TII->get(Opcode))
+            .addReg(AArch64::SP)
+            .addReg(BaseReg)
+            .addImm(BaseRegOffsetBytes / 16)
+            .cloneMergedMemRefs(Instrs); // what about other instructions flags?
+    // A store to [BaseReg, #0] should go last for an opportunity to fold the
+    // final SP adjustment in the epilogue.
+    if (BaseRegOffsetBytes == 0)
+      LastI = I;
+    BaseRegOffsetBytes += InstrSize;
+    Size -= InstrSize;
+  }
+
+  if (LastI)
+    MBB->splice(II, MBB, LastI);
+}
+
+// Check if *II is a register update that can be merged into STGloop that ends
+// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the
+// end of the loop.
+//
+// Only "ADDXri Reg, Reg, #imm" / "SUBXri Reg, Reg, #imm" are considered. The
+// merge is accepted when the leftover adjustment is a multiple of 16 whose
+// magnitude does not exceed 0xFFF. On failure *RemainingOffset is set to 0.
+static bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
+                              int64_t Size, int64_t *RemainingOffset) {
+  MachineInstr &MI = *II;
+  if ((MI.getOpcode() == AArch64::ADDXri ||
+       MI.getOpcode() == AArch64::SUBXri) &&
+      MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
+    unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
+    int64_t Offset = MI.getOperand(2).getImm() << Shift;
+    if (MI.getOpcode() == AArch64::SUBXri)
+      Offset = -Offset;
+    // The loop already advances Reg by Size; only the difference remains.
+    Offset -= Size;
+    int64_t AbsOffset = std::abs(Offset);
+    const int64_t kMaxOffset =
+        0xFFF; // Max encoding for unshifted ADDXri / SUBXri
+    if (AbsOffset <= kMaxOffset && AbsOffset % 16 == 0) {
+      *RemainingOffset = Offset;
+      return true;
+    }
+  }
+  *RemainingOffset = 0;
+  return false;
+}
+
+// Emit an STG loop starting at FrameReg + FrameRegOffset of length Size, adding
+// an extra RemainingOffset to FrameReg at the end.
+//
+// FrameReg is first advanced by FrameRegOffset and then used as the loop's
+// write-back address register. The loop pseudo covers Size rounded down to a
+// multiple of 32 bytes. When Size is an odd multiple of 16, the last 16 bytes
+// are tagged by a post-indexed ST[Z]G whose increment also folds in
+// RemainingOffset; otherwise a non-zero RemainingOffset is applied with a
+// separate ADDXri / SUBXri. Flags is copied onto the loop pseudo and onto the
+// trailing update instruction. TFI is currently unused.
+static void EmitSetTagLoopWithUpdate(
+    MachineBasicBlock::iterator InsertI, const AArch64FrameLowering *TFI,
+    unsigned FrameReg, StackOffset FrameRegOffset, int64_t Size,
+    int64_t RemainingOffset, unsigned Flags,
+    const SmallVectorImpl<MachineInstr *> &Instrs, bool ZeroData) {
+  DebugLoc DL = Instrs[0]->getDebugLoc();
+  MachineBasicBlock *MBB = InsertI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  unsigned Reg1;
+  FindOrCreateScratchRegisters(&MF.getRegInfo(), Instrs, &Reg1, nullptr);
+
+  emitFrameOffset(*MBB, InsertI, DL, FrameReg, FrameReg, FrameRegOffset, TII);
+  BuildMI(*MBB, InsertI, DL,
+          TII->get(ZeroData ? AArch64::STZGloop : AArch64::STGloop))
+      .addDef(Reg1)
+      .addDef(FrameReg)
+      .addImm(Size - Size % 32)
+      .addReg(FrameReg)
+      .cloneMergedMemRefs(Instrs)
+      .setMIFlags(Flags);
+  if (Size % 32 != 0) {
+    // One trailing 16-byte store. Its post-index immediate is in units of 16
+    // bytes: 1 steps over the bytes just tagged, and RemainingOffset / 16
+    // folds in the merged register update (0 when there is none).
+    // NOTE(review): canMergeRegUpdate accepts |RemainingOffset| up to 0xFF0,
+    // so this immediate can reach 256 - confirm it fits the post-index
+    // encoding.
+    BuildMI(*MBB, InsertI, DL,
+            TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
+        .addDef(FrameReg)
+        .addReg(FrameReg)
+        .addReg(FrameReg)
+        .addImm(RemainingOffset / 16 + 1)
+        .cloneMergedMemRefs(Instrs)
+        .setMIFlags(Flags);
+  } else if (RemainingOffset) {
+    BuildMI(*MBB, InsertI, DL,
+            TII->get(RemainingOffset > 0 ? AArch64::ADDXri : AArch64::SUBXri))
+        .addDef(FrameReg)
+        .addReg(FrameReg)
+        .addImm(std::abs(RemainingOffset))
+        .addImm(0)
+        .setMIFlags(Flags);
+  }
+}
+
+// Emit an STG loop of length Size starting at FrameReg + FrameRegOffset
+// without modifying FrameReg: the start address is computed into a scratch
+// register, which then serves as the loop's write-back address register.
+// TFI is currently unused.
+static void EmitSetTagLoop(MachineBasicBlock::iterator InsertI,
+                           const AArch64FrameLowering *TFI, unsigned FrameReg,
+                           StackOffset FrameRegOffset, int64_t Size,
+                           const SmallVectorImpl<MachineInstr *> &Instrs,
+                           bool ZeroData) {
+  DebugLoc DL = Instrs[0]->getDebugLoc();
+  MachineBasicBlock *MBB = InsertI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  unsigned Reg1, Reg2;
+  FindOrCreateScratchRegisters(&MF.getRegInfo(), Instrs, &Reg1, &Reg2);
+
+  emitFrameOffset(*MBB, InsertI, DL, Reg2, FrameReg, FrameRegOffset, TII);
+  BuildMI(*MBB, InsertI, DL,
+          TII->get(ZeroData ? AArch64::STZGloop : AArch64::STGloop))
+      .addDef(Reg1)
+      .addDef(Reg2)
+      .addImm(Size)
+      .addReg(Reg2)
+      .cloneMergedMemRefs(Instrs);
+}
+
+// One tag store candidate: the instruction together with the frame offset and
+// byte size decoded from it.
+struct TagStoreInstr {
+  MachineInstr *MI;
+  int64_t Offset, Size;
+  explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
+      : MI(MI), Offset(Offset), Size(Size) {}
+};
+
+// Detect a run of memory tagging instructions for adjacent stack frame slots,
+// and replace them with a shorter instruction sequence:
+// * replace STG + STG with ST2G
+// * replace STGloop + STGloop with STGloop
+// This code needs to run when stack slot offsets are already known, but before
+// FrameIndex operands in STG instructions are eliminated.
+//
+// II must point at a valid instruction. Starting there, consecutive tag stores
+// (transient instructions in between are skipped) with the same zeroing mode
+// are collected; the run is rewritten only if, sorted by offset, the slots
+// form one contiguous range. Returns the iterator at which the caller should
+// resume scanning. RS is currently unused.
+static MachineBasicBlock::iterator
+tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
+                    const AArch64FrameLowering *TFI, RegScavenger *RS) {
+  bool FirstZeroData;
+  int64_t Size, Offset;
+  MachineBasicBlock::iterator InsertI = II;
+  MachineInstr &MI = *II;
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock::iterator NextI = ++II;
+  // The last instruction of the block has nothing after it to merge with.
+  if (&MI == &MBB->instr_back())
+    return II;
+  if (!extractSTGOffsetAndSize(MI, Offset, Size, FirstZeroData))
+    return II;
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  // Inline capacity is a tuning guess; it does not affect behaviour.
+  SmallVector<TagStoreInstr, 4> Instrs;
+  Instrs.emplace_back(&MI, Offset, Size);
+
+  for (MachineBasicBlock::iterator E = MBB->end(); NextI != E; ++NextI) {
+    MachineInstr &NextMI = *NextI;
+    if (NextMI.isTransient())
+      continue;
+    bool NextZeroData;
+    int64_t NextSize, NextOffset;
+    if (!extractSTGOffsetAndSize(NextMI, NextOffset, NextSize, NextZeroData))
+      break;
+    // Zeroing and non-zeroing stores cannot share one replacement sequence.
+    if (NextZeroData != FirstZeroData)
+      break;
+    Instrs.emplace_back(&NextMI, NextOffset, NextSize);
+  }
+
+  llvm::stable_sort(Instrs,
+                    [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
+                      return Left.Offset < Right.Offset;
+                    });
+
+  // Every slot must start exactly where the previous one ended; any gap or
+  // overlap aborts the merge.
+  int64_t CurOffset = Instrs[0].Offset;
+  for (auto &Instr : Instrs) {
+    if (CurOffset != Instr.Offset)
+      return NextI;
+    CurOffset += Instr.Size;
+  }
+
+  LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
+             for (auto &Instr
+                  : Instrs) { dbgs() << " " << *Instr.MI; });
+
+  int64_t StartOffset = Instrs[0].Offset;
+  int64_t FullSize = CurOffset - StartOffset;
+
+  SmallVector<MachineInstr *, 4> InstrList;
+  for (auto &Instr : Instrs) {
+    InstrList.push_back(Instr.MI);
+  }
+
+  LLVM_DEBUG(dbgs() << "Frame offset: " << StartOffset
+                    << ", total size: " << FullSize << "\n");
+
+  // Ranges below this many bytes are unrolled; larger ones use a loop.
+  const int kSetTagLoopThreshold = 176;
+  if (FullSize < kSetTagLoopThreshold) {
+    // A lone small store is already in its shortest form.
+    if (Instrs.size() < 2)
+      return II;
+    EmitUnrolledSetTag(InsertI, TFI, StartOffset, FullSize, InstrList,
+                       FirstZeroData);
+  } else {
+    unsigned FrameReg;
+    StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference(
+        MF, StartOffset, false /*isFixed*/, false /*isSVE*/, FrameReg,
+        /*PreferFP=*/false, /*ForSimm=*/true);
+    int64_t RemainingOffset = 0;
+    // See if we can merge base register update into the STGloop.
+    // This is done in AArch64LoadStoreOptimizer for "normal" stores,
+    // but STGloop is way too unusual for that, and also it only
+    // realistically happens in function epilogue. Also, STGloop is expanded
+    // before that pass.
+    if (NextI != MBB->end() &&
+        canMergeRegUpdate(NextI, FrameReg, FrameRegOffset.getBytes() + FullSize,
+                          &RemainingOffset)) {
+      // Step past the update before erasing it so NextI stays valid.
+      MachineInstr *UpdateInstr = &*NextI++;
+      LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n " << *UpdateInstr);
+
+      EmitSetTagLoopWithUpdate(InsertI, TFI, FrameReg, FrameRegOffset, FullSize,
+                               RemainingOffset, UpdateInstr->getFlags(),
+                               InstrList, FirstZeroData);
+      UpdateInstr->eraseFromParent();
+    } else {
+      // A single loop with no update to fold gains nothing from a rewrite.
+      if (Instrs.size() < 2)
+        return II;
+      EmitSetTagLoop(InsertI, TFI, FrameReg, FrameRegOffset, FullSize,
+                     InstrList, FirstZeroData);
+    }
+  }
+
+  // The replacement sequence is in place; drop the original tag stores.
+  for (auto &Instr : Instrs)
+    Instr.MI->eraseFromParent();
+
+  return NextI;
+}
+
+// Target hook: unless disabled through -stack-tagging-merge-settag, walk every
+// basic block and try to merge runs of adjacent stack tag stores.
+void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
+    MachineFunction &MF, RegScavenger *RS = nullptr) const {
+  if (StackTaggingMergeSetTag)
+    for (auto &BB : MF)
+      for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
+        II = tryMergeAdjacentSTG(II, this, RS);
+}
+
+/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
+/// before the update. This is easily retrieved as it is exactly the offset
+/// that is set in processFunctionBeforeFrameFinalized.
int AArch64FrameLowering::getFrameIndexReferencePreferSP( const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3396,6 +3396,8 @@ case AArch64::ST1Fourv1d: case AArch64::IRG: case AArch64::IRGstack: + case AArch64::STGloop: + case AArch64::STZGloop: return AArch64FrameOffsetCannotUpdate; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1457,13 +1457,13 @@ // $Rn_wback is one past the end of the range. let isCodeGenOnly=1, mayStore=1 in { def STGloop - : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, Sched<[WriteAdr, WriteST]>; def STZGloop - : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, Sched<[WriteAdr, WriteST]>; } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -124,7 +124,6 @@ unsigned getLocalAddressRegister(const MachineFunction &MF) const; }; - } // end namespace llvm #endif diff --git 
a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -390,6 +390,10 @@ if (isFrameOffsetLegal(MI, AArch64::SP, Offset)) return false; + // If even offset 0 is illegal, we don't want a virtual base register. + if (!isFrameOffsetLegal(MI, AArch64::SP, 0)) + return false; + // The offset likely isn't legal; we want to allocate a virtual base register. return true; } @@ -446,6 +450,17 @@ (void)Done; } + +static Register getScratchRegisterForInstruction(MachineInstr &MI) { + // ST*Gloop can only have #fi in op3, and they have a constraint that op1==op3. + // Use op1 as a scratch register. + if (MI.getOpcode() == AArch64::STGloop || MI.getOpcode() == AArch64::STZGloop) + return MI.getOperand(1).getReg(); + else + return MI.getMF()->getRegInfo().createVirtualRegister( + &AArch64::GPR64RegClass); +} + void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -502,8 +517,7 @@ // in a scratch register. Offset = TFI->resolveFrameIndexReference( MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); - Register ScratchReg = - MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + Register ScratchReg = getScratchRegisterForInstruction(MI); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg) @@ -532,8 +546,7 @@ // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. 
- Register ScratchReg = - MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + Register ScratchReg = getScratchRegisterForInstruction(MI); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); } diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -125,19 +125,13 @@ return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand, ZeroData); - if (ObjSize % 32 != 0) { - SDNode *St1 = DAG.getMachineNode( - ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl, - {MVT::i64, MVT::Other}, - {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain}); - DAG.setNodeMemRefs(cast(St1), {BaseMemOperand}); - ObjSize -= 16; - Addr = SDValue(St1, 0); - Chain = SDValue(St1, 1); - } - const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other}; - SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain}; + + if (Addr.getOpcode() == ISD::FrameIndex) { + int FI = cast(Addr)->getIndex(); + Addr = DAG.getTargetFrameIndex(FI, MVT::i64); + } + SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain}; SDNode *St = DAG.getMachineNode( ZeroData ? 
AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops); diff --git a/llvm/test/CodeGen/AArch64/settag-merge.ll b/llvm/test/CodeGen/AArch64/settag-merge.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/settag-merge.ll @@ -0,0 +1,168 @@ +; RUN: llc < %s -mtriple=aarch64 -mattr=+mte | FileCheck %s + +declare void @llvm.aarch64.settag(i8* %p, i64 %a) +declare void @llvm.aarch64.settag.zero(i8* %p, i64 %a) + +define void @stg16_16() { +entry: +; CHECK-LABEL: stg16_16: +; CHECK: st2g sp, [sp], #32 +; CHECK: ret + %a = alloca i8, i32 16, align 16 + %b = alloca i8, i32 16, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 16) + call void @llvm.aarch64.settag(i8* %b, i64 16) + ret void +} + +define void @stg16_16_16_16() { +entry: +; CHECK-LABEL: stg16_16_16_16: +; CHECK: st2g sp, [sp, #32] +; CHECK: st2g sp, [sp], #64 +; CHECK: ret + %a = alloca i8, i32 16, align 16 + %b = alloca i8, i32 16, align 16 + %c = alloca i8, i32 16, align 16 + %d = alloca i8, i32 16, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 16) + call void @llvm.aarch64.settag(i8* %b, i64 16) + call void @llvm.aarch64.settag(i8* %c, i64 16) + call void @llvm.aarch64.settag(i8* %d, i64 16) + ret void +} + +define void @stg128_128_128_128() { +entry: +; CHECK-LABEL: stg128_128_128_128: +; CHECK: mov x8, #512 +; CHECK: st2g sp, [sp], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: ret + %a = alloca i8, i32 128, align 16 + %b = alloca i8, i32 128, align 16 + %c = alloca i8, i32 128, align 16 + %d = alloca i8, i32 128, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 128) + call void @llvm.aarch64.settag(i8* %b, i64 128) + call void @llvm.aarch64.settag(i8* %c, i64 128) + call void @llvm.aarch64.settag(i8* %d, i64 128) + ret void +} + +define void @stg16_512_16() { +entry: +; CHECK-LABEL: stg16_512_16: +; CHECK: mov x8, #544 +; CHECK: st2g sp, [sp], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: ret + %a = alloca i8, i32 16, align 16 + %b = alloca i8, 
i32 512, align 16 + %c = alloca i8, i32 16, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 16) + call void @llvm.aarch64.settag(i8* %b, i64 512) + call void @llvm.aarch64.settag(i8* %c, i64 16) + ret void +} + +define void @stg512_512_512() { +entry: +; CHECK-LABEL: stg512_512_512: +; CHECK: mov x8, #1536 +; CHECK: st2g sp, [sp], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: ret + %a = alloca i8, i32 512, align 16 + %b = alloca i8, i32 512, align 16 + %c = alloca i8, i32 512, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 512) + call void @llvm.aarch64.settag(i8* %b, i64 512) + call void @llvm.aarch64.settag(i8* %c, i64 512) + ret void +} + +define void @early(i1 %flag) { +entry: +; CHECK-LABEL: early: +; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] +; CHECK: st2g sp, [sp, # +; CHECK: st2g sp, [sp, # +; CHECK: st2g sp, [sp, # +; CHECK: [[LABEL]]: +; CHECK: stg sp, [sp, # +; CHECK: st2g sp, [sp], # +; CHECK: ret + %a = alloca i8, i32 48, align 16 + %b = alloca i8, i32 48, align 16 + %c = alloca i8, i32 48, align 16 + br i1 %flag, label %if.then, label %if.end + +if.then: + call void @llvm.aarch64.settag(i8* %a, i64 48) + call void @llvm.aarch64.settag(i8* %b, i64 48) + br label %if.end + +if.end: + call void @llvm.aarch64.settag(i8* %c, i64 48) + ret void +} + +define void @early_128_128(i1 %flag) { +entry: +; CHECK-LABEL: early_128_128: +; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] +; CHECK: add x9, sp, # +; CHECK: mov x8, #256 +; CHECK: st2g x9, [x9], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: [[LABEL]]: +; CHECK: stg sp, [sp, # +; CHECK: st2g sp, [sp], # +; CHECK: ret + %a = alloca i8, i32 128, align 16 + %b = alloca i8, i32 128, align 16 + %c = alloca i8, i32 48, align 16 + br i1 %flag, label %if.then, label %if.end + +if.then: + call void @llvm.aarch64.settag(i8* %a, i64 128) + call void @llvm.aarch64.settag(i8* %b, i64 128) + br label %if.end + +if.end: + call void @llvm.aarch64.settag(i8* %c, i64 48) + ret void +} + +define void 
@early_512_512(i1 %flag) { +entry: +; CHECK-LABEL: early_512_512: +; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] +; CHECK: add x9, sp, # +; CHECK: mov x8, #1024 +; CHECK: st2g x9, [x9], #32 +; CHECK: sub x8, x8, #32 +; CHECK: cbnz x8, +; CHECK: [[LABEL]]: +; CHECK: stg sp, [sp, # +; CHECK: st2g sp, [sp], # +; CHECK: ret + %a = alloca i8, i32 512, align 16 + %b = alloca i8, i32 512, align 16 + %c = alloca i8, i32 48, align 16 + br i1 %flag, label %if.then, label %if.end + +if.then: + call void @llvm.aarch64.settag(i8* %a, i64 512) + call void @llvm.aarch64.settag(i8* %b, i64 512) + br label %if.end + +if.end: + call void @llvm.aarch64.settag(i8* %c, i64 48) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/settag.ll b/llvm/test/CodeGen/AArch64/settag.ll --- a/llvm/test/CodeGen/AArch64/settag.ll +++ b/llvm/test/CodeGen/AArch64/settag.ll @@ -64,8 +64,8 @@ define void @stg17(i8* %p) { entry: ; CHECK-LABEL: stg17: -; CHECK: mov {{(w|x)}}[[R:[0-9]+]], #256 ; CHECK: stg x0, [x0], #16 +; CHECK: mov {{(w|x)}}[[R:[0-9]+]], #256 ; CHECK: st2g x0, [x0], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], @@ -87,8 +87,8 @@ define void @stzg17(i8* %p) { entry: ; CHECK-LABEL: stzg17: -; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 ; CHECK: stzg x0, [x0], #16 +; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 ; CHECK: stz2g x0, [x0], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], @@ -110,10 +110,10 @@ define void @stg_alloca5() { entry: ; CHECK-LABEL: stg_alloca5: -; CHECK: stg sp, [sp, #64] -; CHECK: st2g sp, [sp, #32] -; CHECK: st2g sp, [sp] -; CHECK: ret +; CHECK: st2g sp, [sp, #32] +; CHECK-NEXT: stg sp, [sp, #64] +; CHECK-NEXT: st2g sp, [sp], #80 +; CHECK-NEXT: ret %a = alloca i8, i32 80, align 16 call void @llvm.aarch64.settag(i8* %a, i64 80) ret void @@ -122,12 +122,11 @@ define void @stg_alloca17() { entry: ; CHECK-LABEL: stg_alloca17: -; CHECK: mov [[P:x[0-9]+]], sp -; CHECK: stg [[P]], {{\[}}[[P]]{{\]}}, #16 ; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 -; CHECK: st2g [[P]], 
{{\[}}[[P]]{{\]}}, #32 +; CHECK: st2g sp, [sp], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], +; CHECK: stg sp, [sp], #16 ; CHECK: ret %a = alloca i8, i32 272, align 16 call void @llvm.aarch64.settag(i8* %a, i64 272)