Index: llvm/lib/Target/AArch64/AArch64FrameLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -3234,7 +3234,7 @@ // instructions. May skip if the replacement is not profitable. May invalidate // the input iterator and replace it with a valid one. void emitCode(MachineBasicBlock::iterator &InsertI, - const AArch64FrameLowering *TFI, bool IsLast); + const AArch64FrameLowering *TFI, bool TryMergeSPUpdate); }; void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) { @@ -3373,7 +3373,8 @@ } void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, - const AArch64FrameLowering *TFI, bool IsLast) { + const AArch64FrameLowering *TFI, + bool TryMergeSPUpdate) { if (TagStores.empty()) return; TagStoreInstr &FirstTagStore = TagStores[0]; @@ -3403,8 +3404,8 @@ emitUnrolled(InsertI); } else { MachineInstr *UpdateInstr = nullptr; - int64_t TotalOffset; - if (IsLast) { + int64_t TotalOffset = 0; + if (TryMergeSPUpdate) { // See if we can merge base register update into the STGloop. // This is done in AArch64LoadStoreOptimizer for "normal" stores, // but STGloop is way too unusual for that, and also it only @@ -3549,7 +3550,7 @@ for (auto &Instr : Instrs) { if (EndOffset && *EndOffset != Instr.Offset) { // Found a gap. - TSE.emitCode(InsertI, TFI, /*IsLast = */ false); + TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false); TSE.clear(); } @@ -3557,7 +3558,11 @@ EndOffset = Instr.Offset + Instr.Size; } - TSE.emitCode(InsertI, TFI, /*IsLast = */ true); + // Multiple FP/SP updates in a loop cannot be described by CFI instructions. 
+ TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ + !MBB->getParent() + ->getInfo<AArch64FunctionInfo>() + ->needsAsyncDwarfUnwindInfo()); return InsertI; } Index: llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -589,23 +589,31 @@ // Create a scratch register for the frame index elimination in an instruction. // This function has special handling of stack tagging loop pseudos, in which -// case it can also change the instruction opcode (but not the operands). +// case it can also change the instruction opcode. static Register -createScratchRegisterForInstruction(MachineInstr &MI, +createScratchRegisterForInstruction(MachineInstr &MI, unsigned FIOperandNum, const AArch64InstrInfo *TII) { // ST*Gloop have a reserved scratch register in operand 1. Use it, and also // replace the instruction with the writeback variant because it will now // satisfy the operand constraints for it. - if (MI.getOpcode() == AArch64::STGloop) { - MI.setDesc(TII->get(AArch64::STGloop_wback)); - return MI.getOperand(1).getReg(); - } else if (MI.getOpcode() == AArch64::STZGloop) { - MI.setDesc(TII->get(AArch64::STZGloop_wback)); - return MI.getOperand(1).getReg(); + Register ScratchReg; + if (MI.getOpcode() == AArch64::STGloop || + MI.getOpcode() == AArch64::STZGloop) { + assert(FIOperandNum == 3 && + "Wrong frame index operand for STGloop/STZGloop"); + unsigned Op = MI.getOpcode() == AArch64::STGloop ? 
AArch64::STGloop_wback + : AArch64::STZGloop_wback; + ScratchReg = MI.getOperand(1).getReg(); + MI.getOperand(3).ChangeToRegister(ScratchReg, false, false, true); + MI.setDesc(TII->get(Op)); + MI.tieOperands(1, 3); } else { - return MI.getMF()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); + ScratchReg = + MI.getMF()->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + MI.getOperand(FIOperandNum) + .ChangeToRegister(ScratchReg, false, false, true); } + return ScratchReg; } void AArch64RegisterInfo::getOffsetOpcodes( @@ -722,9 +730,9 @@ // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. - Register ScratchReg = createScratchRegisterForInstruction(MI, TII); + Register ScratchReg = + createScratchRegisterForInstruction(MI, FIOperandNum, TII); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); - MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); } unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, Index: llvm/test/CodeGen/AArch64/settag.ll =================================================================== --- llvm/test/CodeGen/AArch64/settag.ll +++ llvm/test/CodeGen/AArch64/settag.ll @@ -146,14 +146,12 @@ ret void } -define void @stg_alloca17() uwtable { +define void @stg_alloca17() nounwind { ; CHECK-LABEL: stg_alloca17: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #288 -; CHECK-NEXT: .cfi_def_cfa_offset 288 -; CHECK-NEXT: str x29, [sp, #272] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mov x8, #256 +; CHECK-NEXT: str x29, [sp, #272] // 8-byte Folded Spill ; CHECK-NEXT: .LBB11_1: // %entry ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: st2g sp, [sp], #32 @@ -161,6 +159,31 @@ ; CHECK-NEXT: cbnz x8, .LBB11_1 ; CHECK-NEXT: // %bb.2: // %entry ; CHECK-NEXT: stg sp, [sp], #16 +; CHECK-NEXT: 
ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %a = alloca i8, i32 272, align 16 + call void @llvm.aarch64.settag(i8* %a, i64 272) + ret void +} + +define void @stg_alloca18() uwtable { +; CHECK-LABEL: stg_alloca18: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #288 +; CHECK-NEXT: .cfi_def_cfa_offset 288 +; CHECK-NEXT: str x29, [sp, #272] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mov x8, #256 +; CHECK-NEXT: stg x9, [x9], #16 +; CHECK-NEXT: .LBB12_1: // %entry +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub x8, x8, #32 +; CHECK-NEXT: st2g x9, [x9], #32 +; CHECK-NEXT: cbnz x8, .LBB12_1 +; CHECK-NEXT: // %bb.2: // %entry +; CHECK-NEXT: add sp, sp, #272 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0