This is an archive of the discontinued LLVM Phabricator instance.

[AArch64][SME] Always allocate a lazy-save buffer if a function has ZA state.
ClosedPublic

Authored by sdesmalen on Nov 17 2022, 7:03 AM.

Download Raw Diff

Details

Reviewers

paulwalker-arm
kmclaughlin
aemerson

Commits

rG3f9d64a2adc5: [AArch64][SME] Always allocate a lazy-save buffer if a function has ZA state.

Summary

We already do this for most cases, with the exception of instructions that
get expanded to function calls (e.g. for lowering operations on fp128
values), in which case we temporarily allocate a lazy-save buffer.

However, the code generated in this case is incorrect: it appears to pass
the wrong address for the TPIDR2 object to the ZA restore function. By
always allocating the lazy-save buffer once, we avoid this issue entirely.

The cost is that we also allocate such a buffer when it is not
needed. We could fix that in a follow-up patch, where we remove the
lazy-save buffer when it isn't used.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

sdesmalen created this revision.Nov 17 2022, 7:03 AM

Herald added a project: Restricted Project. · View Herald TranscriptNov 17 2022, 7:03 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

sdesmalen requested review of this revision.Nov 17 2022, 7:03 AM

Herald added a project: Restricted Project. · View Herald TranscriptNov 17 2022, 7:03 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B198203: Diff 476116.Nov 17 2022, 7:03 AM

paulwalker-arm accepted this revision.Nov 18 2022, 3:51 AM

paulwalker-arm added inline comments.

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
6431–6432	What about "Conservatively assume the function requires the lazy-save mechanism."?

This revision is now accepted and ready to land.Nov 18 2022, 3:51 AM

This revision was landed with ongoing or failed builds.Nov 21 2022, 8:33 AM

Closed by commit rG3f9d64a2adc5: [AArch64][SME] Always allocate a lazy-save buffer if a function has ZA state. (authored by sdesmalen). · Explain Why

This revision was automatically updated to reflect the committed changes.

sdesmalen added a commit: rG3f9d64a2adc5: [AArch64][SME] Always allocate a lazy-save buffer if a function has ZA state..

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

22 lines

test/

CodeGen/

AArch64/

sme-disable-gisel-fisel.ll

4 lines

sme-shared-za-interface.ll

4 lines

Diff 476921

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,007 Lines • ▼ Show 20 Lines

CCAssignFn *		CCAssignFn *
AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {		AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS		return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;		: RetCC_AArch64_AAPCS;
}		}


/// Returns true if the Function has ZA state and contains at least one call to
/// a function that requires setting up a lazy-save buffer.
static bool requiresBufferForLazySave(const Function &F) {
SMEAttrs CallerAttrs(F);
if (!CallerAttrs.hasZAState())
return false;

for (const BasicBlock &BB : F)
for (const Instruction &I : BB)
if (const CallInst *Call = dyn_cast<CallInst>(&I))
if (CallerAttrs.requiresLazySave(SMEAttrs(*Call)))
return true;
return false;
}

unsigned		unsigned
AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,		AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();		MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();		MachineFrameInfo &MFI = MF.getFrameInfo();

// Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)		// Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,		SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
▲ Show 20 Lines • Show All 399 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::LowerFormalArguments(
}		}
// Even if we're not expected to free up the space, it's useful to know how		// Even if we're not expected to free up the space, it's useful to know how
// much is there while considering tail calls (because we can reuse it).		// much is there while considering tail calls (because we can reuse it).
FuncInfo->setBytesInStackArgArea(StackArgSize);		FuncInfo->setBytesInStackArgArea(StackArgSize);

if (Subtarget->hasCustomCallingConv())		if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);		Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);

if (requiresBufferForLazySave(MF.getFunction())) {		// Conservatively assume the function requires the lazy-save mechanism.
// Set up a buffer once and store the buffer in the MachineFunctionInfo.		if (SMEAttrs(MF.getFunction()).hasZAState()) {
		paulwalker-arm (inline comment, not done): What about "Conservatively assume the function requires the lazy-save mechanism."?
unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);		unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);		FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
}		}

return Chain;		return Chain;
}		}

void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,		void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
▲ Show 20 Lines • Show All 580 Lines • ▼ Show 20 Lines	AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (RequiresLazySave) {		if (RequiresLazySave) {
// Set up a lazy save mechanism by storing the runtime live slices		// Set up a lazy save mechanism by storing the runtime live slices
// (worst-case N*N) to the TPIDR2 stack object.		// (worst-case N*N) to the TPIDR2 stack object.
SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,		SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
DAG.getConstant(1, DL, MVT::i32));		DAG.getConstant(1, DL, MVT::i32));
SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);		SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();		unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();

if (!TPIDR2Obj)
TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);

MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);		MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,		SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));		DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
SDValue BufferPtrAddr =		SDValue BufferPtrAddr =
DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,		DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));		DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16);		Chain = DAG.getTruncStore(Chain, DL, NN, BufferPtrAddr, MPI, MVT::i16);
Chain = DAG.getNode(		Chain = DAG.getNode(
▲ Show 20 Lines • Show All 16,345 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll

	Show First 20 Lines • Show All 304 Lines • ▼ Show 20 Lines
	; CHECK-COMMON-NEXT: mov x29, sp			; CHECK-COMMON-NEXT: mov x29, sp
	; CHECK-COMMON-NEXT: sub sp, sp, #16			; CHECK-COMMON-NEXT: sub sp, sp, #16
	; CHECK-COMMON-NEXT: rdsvl x8, #1			; CHECK-COMMON-NEXT: rdsvl x8, #1
	; CHECK-COMMON-NEXT: mov x9, sp			; CHECK-COMMON-NEXT: mov x9, sp
	; CHECK-COMMON-NEXT: mul x8, x8, x8			; CHECK-COMMON-NEXT: mul x8, x8, x8
	; CHECK-COMMON-NEXT: sub x9, x9, x8			; CHECK-COMMON-NEXT: sub x9, x9, x8
	; CHECK-COMMON-NEXT: mov sp, x9			; CHECK-COMMON-NEXT: mov sp, x9
	; CHECK-COMMON-NEXT: sub x10, x29, #16			; CHECK-COMMON-NEXT: sub x10, x29, #16
	; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
	; CHECK-COMMON-NEXT: stur x9, [x29, #-16]			; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
				; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
	; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10			; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
	; CHECK-COMMON-NEXT: bl __addtf3			; CHECK-COMMON-NEXT: bl __addtf3
	; CHECK-COMMON-NEXT: smstart za			; CHECK-COMMON-NEXT: smstart za
	; CHECK-COMMON-NEXT: add x0, x29, #0			; CHECK-COMMON-NEXT: sub x0, x29, #16
	; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0			; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
	; CHECK-COMMON-NEXT: cbnz x8, .LBB8_2			; CHECK-COMMON-NEXT: cbnz x8, .LBB8_2
	; CHECK-COMMON-NEXT: // %bb.1:			; CHECK-COMMON-NEXT: // %bb.1:
	; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore			; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore
	; CHECK-COMMON-NEXT: .LBB8_2:			; CHECK-COMMON-NEXT: .LBB8_2:
	; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr			; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
	; CHECK-COMMON-NEXT: mov sp, x29			; CHECK-COMMON-NEXT: mov sp, x29
	; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload			; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
	Show All 33 Lines

llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: mov x29, sp			; CHECK-NEXT: mov x29, sp
	; CHECK-NEXT: sub sp, sp, #16			; CHECK-NEXT: sub sp, sp, #16
	; CHECK-NEXT: rdsvl x8, #1			; CHECK-NEXT: rdsvl x8, #1
	; CHECK-NEXT: mov x9, sp			; CHECK-NEXT: mov x9, sp
	; CHECK-NEXT: mul x8, x8, x8			; CHECK-NEXT: mul x8, x8, x8
	; CHECK-NEXT: sub x9, x9, x8			; CHECK-NEXT: sub x9, x9, x8
	; CHECK-NEXT: mov sp, x9			; CHECK-NEXT: mov sp, x9
	; CHECK-NEXT: sub x10, x29, #16			; CHECK-NEXT: sub x10, x29, #16
	; CHECK-NEXT: sturh w8, [x29, #-8]
	; CHECK-NEXT: stur x9, [x29, #-16]			; CHECK-NEXT: stur x9, [x29, #-16]
				; CHECK-NEXT: sturh w8, [x29, #-8]
	; CHECK-NEXT: msr TPIDR2_EL0, x10			; CHECK-NEXT: msr TPIDR2_EL0, x10
	; CHECK-NEXT: bl __addtf3			; CHECK-NEXT: bl __addtf3
	; CHECK-NEXT: smstart za			; CHECK-NEXT: smstart za
	; CHECK-NEXT: add x0, x29, #0			; CHECK-NEXT: sub x0, x29, #16
	; CHECK-NEXT: mrs x8, TPIDR2_EL0			; CHECK-NEXT: mrs x8, TPIDR2_EL0
	; CHECK-NEXT: cbnz x8, .LBB1_2			; CHECK-NEXT: cbnz x8, .LBB1_2
	; CHECK-NEXT: // %bb.1:			; CHECK-NEXT: // %bb.1:
	; CHECK-NEXT: bl __arm_tpidr2_restore			; CHECK-NEXT: bl __arm_tpidr2_restore
	; CHECK-NEXT: .LBB1_2:			; CHECK-NEXT: .LBB1_2:
	; CHECK-NEXT: msr TPIDR2_EL0, xzr			; CHECK-NEXT: msr TPIDR2_EL0, xzr
	; CHECK-NEXT: mov sp, x29			; CHECK-NEXT: mov sp, x29
	; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload			; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%res = fadd fp128 %a, %b			%res = fadd fp128 %a, %b
	ret fp128 %res			ret fp128 %res
	}			}