This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Do not unnecessarily spill LR because of @llvm.returnaddress
Needs Review · Public

Authored by atrosinenko on Jul 26 2023, 12:08 PM.

Details

Summary

When lowering @llvm.returnaddress with FEAT_PAuth, move the possibly signed
return address out of LR first, to prevent XPACI from clobbering LR in
functions where it would not otherwise be spilled.

Diff Detail

Event Timeline

atrosinenko created this revision. Jul 26 2023, 12:08 PM
Herald added a project: Restricted Project. Jul 26 2023, 12:08 PM
Herald added a subscriber: hiraditya.
atrosinenko published this revision for review. Jul 26 2023, 12:26 PM

While most features of https://github.com/apple/llvm-project/commit/7924c7d75ae0015a9fd9786a580b10b2190bccc6 are already implemented in mainline, @llvm.returnaddress still tends to be lowered like

xpaci x30
mov dest, x30

instead of

mov dest, x30
xpaci dest

This patch forces the return address to be copied out of LR first (at the instruction selection phase) if FEAT_PAuth is available.

Herald added a project: Restricted Project. Jul 26 2023, 12:26 PM
efriedma added inline comments.
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
9729

We normally shouldn't be creating virtual registers here; isel lowering will create copies when necessary. Maybe this is getting weird because of the use of getMachineNode()? Normally, we shouldn't be using getMachineNode() during ISelLowering; we should use something like DAG.getNode(AArch64ISD::XPACI, [...], and lower that to the actual instruction during ISelDAGToDAG.
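
As a rough illustration of that suggestion (hedged: AArch64ISD::XPACI does not exist as a DAG node in mainline, and the code below is a sketch rather than the actual patch):

SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  // Copy the (possibly signed) return address out of LR so the register
  // allocator is free to strip the PAC bits in any register.
  SDValue LR = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::LR, VT);
  // Emit a target-specific DAG node instead of calling getMachineNode();
  // the real XPACI instruction is selected later, in ISelDAGToDAG.
  return DAG.getNode(AArch64ISD::XPACI, DL, VT, LR);
}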

After debugging a bit more, I suspect that the actual reason for emitting XPACI on LR is not the instruction selection phase but register allocation.

Here is a simplified returnaddress.ll test case:

define ptr @ra0() nounwind readnone {
entry:
  %0 = tail call ptr @llvm.returnaddress(i32 0)
  ret ptr %0
}

declare ptr @llvm.returnaddress(i32) nounwind readnone

On a rather recent mainline LLVM commit, I ran the following:

./bin/llc < returnaddress.ll -mtriple=arm64-eabi -mattr=v8.3a -print-after-all -debug-only=regalloc

The transition from LLVM IR to MIR looks good:

*** IR Dump After Module Verifier (verify) ***
; Function Attrs: nounwind memory(none)
define ptr @ra0() #0 {
entry:
  %0 = tail call ptr @llvm.returnaddress(i32 0)
  ret ptr %0
}
# *** IR Dump After AArch64 Instruction Selection (aarch64-isel) ***:
# Machine code for function ra0: IsSSA, TracksLiveness
Function Live Ins: $lr in %0

bb.0.entry:
  liveins: $lr
  %0:gpr64 = COPY $lr
  %1:gpr64 = XPACI %0:gpr64(tied-def 0)
  $x0 = COPY %1:gpr64
  RET_ReallyLR implicit $x0

# End machine code for function ra0.

The operand of XPACI is a virtual register. Then, at register allocation time, the following is printed:

# *** IR Dump After Live Register Matrix (liveregmatrix) ***:
# Machine code for function ra0: NoPHIs, TracksLiveness, TiedOpsRewritten, TracksDebugUserValues
Function Live Ins: $lr in %0

0B	bb.0.entry:
	  liveins: $lr
16B	  %1:gpr64 = COPY $lr
48B	  %1:gpr64 = XPACI %1:gpr64(tied-def 0)
64B	  $x0 = COPY %1:gpr64
80B	  RET_ReallyLR implicit killed $x0

# End machine code for function ra0.

********** GREEDY REGISTER ALLOCATION **********
********** Function: ra0
********** INTERVALS **********
W30 [0B,16r:0) 0@0B-phi
%1 [16r,48r:0)[48r,64r:1) 0@16r 1@48r  weight:INF
RegMasks:
********** MACHINEINSTRS **********
# Machine code for function ra0: NoPHIs, TracksLiveness, TiedOpsRewritten, TracksDebugUserValues
Function Live Ins: $lr in %0

0B	bb.0.entry:
	  liveins: $lr
16B	  %1:gpr64 = COPY $lr
48B	  %1:gpr64 = XPACI %1:gpr64(tied-def 0)
64B	  $x0 = COPY %1:gpr64
80B	  RET_ReallyLR implicit killed $x0

# End machine code for function ra0.

Enqueuing %1
AllocationOrder(GPR64) = [ $x8 $x9 $x10 $x11 $x12 $x13 $x14 $x15 $x16 $x17 $x18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x19 $x20 $x21 $x22 $x23 $x24 $x25 $x26 $x27 $x28 $fp $lr ]

selectOrSplit GPR64:%1 [16r,48r:0)[48r,64r:1) 0@16r 1@48r  weight:INF w=INF
hints: $lr $x0
assigning %1 to $lr: W30 [16r,48r:0)[48r,64r:1) 0@16r 1@48r
# *** IR Dump After Greedy Register Allocator (greedy) ***:
# Machine code for function ra0: NoPHIs, TracksLiveness, TiedOpsRewritten, TracksDebugUserValues
Function Live Ins: $lr in %0

0B	bb.0.entry:
	  liveins: $lr
16B	  %1:gpr64 = COPY $lr
48B	  %1:gpr64 = XPACI %1:gpr64(tied-def 0)
64B	  $x0 = COPY %1:gpr64
80B	  RET_ReallyLR implicit killed $x0

# End machine code for function ra0.

********** REWRITE VIRTUAL REGISTERS **********
********** Function: ra0
********** REGISTER MAP **********
[%1 -> $lr] GPR64

0B	bb.0.entry:
	  liveins: $lr
16B	  %1:gpr64 = COPY $lr
48B	  %1:gpr64 = XPACI killed %1:gpr64(tied-def 0)
64B	  $x0 = COPY killed %1:gpr64
80B	  RET_ReallyLR implicit killed $x0
> renamable $lr = COPY $lr
Identity copy: renamable $lr = COPY $lr
  deleted.
> renamable $lr = XPACI killed renamable $lr(tied-def 0)
> $x0 = COPY killed renamable $lr
> RET_ReallyLR implicit killed $x0
# *** IR Dump After Virtual Register Rewriter (virtregrewriter) ***:
# Machine code for function ra0: NoPHIs, TracksLiveness, NoVRegs, TiedOpsRewritten, TracksDebugUserValues
Function Live Ins: $lr

0B	bb.0.entry:
	  liveins: $lr
48B	  renamable $lr = XPACI killed renamable $lr(tied-def 0)
64B	  $x0 = COPY killed renamable $lr
80B	  RET_ReallyLR implicit killed $x0

# End machine code for function ra0.

Note the hints: $lr $x0 line. I believe that allocating $lr for %1 is not expected, because it is clobbered by XPACI but then needed to return from the function. As far as I understand, the reason is that the RET_ReallyLR instruction hides its use of $lr, so $lr is considered killed at the line above.

I tried overriding the AArch64RegisterInfo::getRegAllocationHints method like this:

bool AArch64RegisterInfo::getRegAllocationHints(
    Register VirtReg, ArrayRef<MCPhysReg> Order,
    SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
    const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
  // Collect the default hints first.
  bool Result = TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
                                                          MF, VRM, Matrix);
  // Drop LR from the hint list so the allocator never prefers it: LR would
  // be clobbered by XPACI but is still needed by the hidden use in
  // RET_ReallyLR.
  erase_if(Hints, [](MCPhysReg Reg) { return Reg == AArch64::LR; });
  return Result && !Hints.empty();
}

And check-llvm passes (with the updated test from this patch), both with GlobalISel disabled and enabled. On the other hand, the RET_ReallyLR instruction was introduced on purpose, and I am not yet sure whether such a fix is appropriate.

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
9729

Thank you for the suggestion; this reload into another register class looked awkward to me, too.

That makes more sense... regalloc sees that you have an operation where the operand and result have to be in the same register, and that register is allocatable, so it doesn't matter if we clobber the value. Which is right, except that it forces LR to be spilled when the function wouldn't need any spills.

I guess the extra copies introduced by this patch trick the register allocator into not doing that?

I'm not sure what the best way to introduce a hint for that is; maybe setRegAllocationHint() is more appropriate?
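
As a hedged illustration of that alternative (VReg and the hinted register below are placeholders, not names from the patch):

// Wherever the vreg feeding XPACI is created, attach a hint steering
// the allocator toward a register other than LR.
MachineRegisterInfo &MRI = MF.getRegInfo();
// Hint type 0 is the target-independent "prefer this physreg" kind.
MRI.setRegAllocationHint(VReg, /*Type=*/0, AArch64::X0);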

ab added a comment (edited). Jul 28 2023, 2:02 PM

We tried to address this problem by adding an XPACIuntied pseudo, selecting only that instead of the real instruction, and later (after regalloc, which is then unburdened by the tied nature of XPAC, as you're discussing) expanding it to add the mov if necessary:

https://github.com/ahmedbougacha/llvm-project/commit/7924c7d75ae0015a9fd9786a580b10b2190bccc6

@atrosinenko do you want to try out that patch, see if it helps?
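
For reference, a hypothetical sketch of how such an untied pseudo could be expanded after register allocation (the pseudo name, operand layout, and use of ORRXrs for the mov are assumptions for illustration, not code from the linked commit):

// In an AArch64ExpandPseudo-style callback; dst and src are untied, so
// regalloc may keep the source (LR) live across the instruction.
case AArch64::XPACIuntied: {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  // If regalloc picked different registers, materialize the mov first
  // (ORRXrs with XZR is the canonical AArch64 register-to-register move).
  if (Dst != Src)
    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXrs), Dst)
        .addReg(AArch64::XZR)
        .addReg(Src)
        .addImm(0);
  // The real (tied) XPACI now reads and writes Dst, leaving LR intact.
  BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::XPACI), Dst)
      .addReg(Dst);
  MI.eraseFromParent();
  return true;
}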

> https://github.com/ahmedbougacha/llvm-project/commit/7924c7d75ae0015a9fd9786a580b10b2190bccc6
>
> @atrosinenko do you want to try out that patch, see if it helps?

Actually, this patch is inspired by the above commit (the commit mentioned in my first message is the same one, just referenced through apple/llvm-project). It turned out that most of its features had previously been reimplemented in mainline in a way similar to your implementation, yet surprisingly without preventing your patch from applying. The optimization of @llvm.returnaddress lowering was among the few features of 7924c7d75a not yet present in mainline (the other being a Darwin-specific optimization). With 7924c7d75a rebased onto a recent mainline commit, it seems to have a minor issue of sometimes emitting extra movs, like this:

_test_returnaddress_1:                  ; @test_returnaddress_1
; %bb.0:
        stp     x29, x30, [sp, #-16]!           ; 16-byte Folded Spill
        mov     x29, sp
        ldr     x8, [x29]
        ldr     x8, [x8, #8]
        mov     x0, x8
        xpaci   x0
        ldp     x29, x30, [sp], #16             ; 16-byte Folded Reload
        ret

Anyway, now I wonder whether we should leave @llvm.returnaddress lowering untouched and instead adjust register allocation somehow (the RET_ReallyLR definition, the getRegAllocationHints method, something else...).

kristof.beyls added inline comments. Aug 4 2023, 12:27 AM
llvm/test/CodeGen/AArch64/aarch64-signedreturnaddress.ll
6

nit: you might be able to spell-correct intsruction -> instruction here while touching this file anyway?

23

Since this function doesn't sign return addresses, the xpaci is not needed, IIUC?
Probably that is yet another optimization that could be implemented. Maybe it's worthwhile to add a FIXME for this here or in AArch64TargetLowering::LowerRETURNADDR?
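
If such a FIXME were acted on, the check in AArch64TargetLowering::LowerRETURNADDR might look roughly like the sketch below (whether shouldSignReturnAddress is the right query here is an assumption, and ReturnAddress stands for the value already read from LR):

// Inside LowerRETURNADDR, once the return address value is available:
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
// Only strip the pointer authentication code if this function actually
// signs its return address; otherwise there is nothing to strip.
if (!FuncInfo->shouldSignReturnAddress(MF))
  return ReturnAddress;
// ...otherwise emit XPACI (or the equivalent DAG node) as before...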